"""Split an investor letter into sentences and synthesize each one to a WAV.

Usage:
    python <this script> [letter_series] [letter_year] [fn_stem] [real_run]

The letter is read from ``./<letter_series>/<letter_year>/1-in/<fn_stem>-letter.txt``
and clips are written to ``./<letter_series>/<letter_year>/3-wavs``.  Every
argument is optional; without a non-empty ``real_run`` argument the script
only prints the sentences it would synthesize (dry run).

Input markup handled line by line:
    [SKIP] ... [/SKIP]     everything in between is dropped
    [TABLE] ... [/TABLE]   first line is a " & "-separated header row, each
                           following line is a space-separated body row
"""

import os
import re
import subprocess
import sys
from urllib.parse import quote

try:
    # Nothing in this script calls into requests; the name is kept available,
    # but a missing package no longer aborts a run that does not need it.
    import requests  # noqa: F401
except ImportError:
    requests = None

speaker_id = "p230"

SUBSTITUTIONS = {
    " - ": "--",
    "Ave.": "Avenue,",
    "Co.": "Company",
}

DEFAULT_LETTER_SERIES = "buffett-partnership"
DEFAULT_LETTER_YEAR = "1956"

# If hosted on IPv6 use http://[::1]:5002
# If hosted on IPv4 use http://localhost:5002
# The backslashes are literally part of the URL handed to curl (the original
# f-string produced them through an invalid "\[" escape).  Presumably they stop
# curl's URL globbing from reading the brackets as a range -- confirm before
# removing them.
TTS_BASE_URL = "http://\\[::1\\]:5002"

# A sentence ends at: 2+ word characters followed by . : or ; and whitespace,
# OR a run of 2+ whitespace characters, OR the " ;\n" that closes a table row.
SENTENCE_END_RE = re.compile(r"(\w\w+([\.:;][\s\n])|([\s\n][\s\n]+)|(\s;\n))")


def parse_args(argv):
    """Return ``(letter_series, letter_year, fn_stem, real_run)`` from *argv*.

    *argv* has the shape of ``sys.argv`` (program name first).  A missing or
    empty positional argument falls back to its default; ``fn_stem`` defaults
    to ``"<letter_series>-<letter_year>"``.  ``real_run`` is True for any
    non-empty fourth argument.
    """
    def positional(position, default):
        value = argv[position] if len(argv) > position else ""
        return value or default

    letter_series = positional(1, DEFAULT_LETTER_SERIES)
    letter_year = positional(2, DEFAULT_LETTER_YEAR)
    fn_stem = positional(3, f"{letter_series}-{letter_year}")
    real_run = bool(positional(4, ""))
    return letter_series, letter_year, fn_stem, real_run


def preprocess_lines(lines, substitutions=None):
    """Return a new list of lines with markup resolved and substitutions applied.

    The result has one entry per input line; lines that are dropped (markers,
    skipped text, table headers) become ``""`` so indices keep lining up.
    Table body rows are rewritten as ``"<header> <value>; ... ;\\n"``.

    Substitutions run before sentence splitting because they often affect
    where sentences end (e.g. "Co." would otherwise look like a sentence end).

    Raises:
        ValueError: a table body row has a different number of fields than
            the table header row.
    """
    if substitutions is None:
        substitutions = SUBSTITUTIONS

    processed = []
    skipping = False
    in_table = False
    table_headers = None  # None until the header row of the current table is read

    for i, line in enumerate(lines):
        marker = line.strip()

        if not skipping and marker == "[SKIP]":
            skipping = True
            processed.append("")
            continue
        if skipping:
            if marker == "[/SKIP]":
                skipping = False
            # Both skipped text and the closing marker are dropped, and neither
            # is fed to the table parser.
            processed.append("")
            continue

        if not in_table and marker == "[TABLE]":
            in_table = True
            line = ""
        elif in_table:
            if marker == "[/TABLE]":
                # Checked before the header so an empty table closes cleanly.
                in_table = False
                table_headers = None
                line = ""
            elif table_headers is None:
                table_headers = [
                    header.replace(".", "") for header in marker.split(" & ")
                ]
                line = ""
                print(table_headers)
            else:
                # Process normal table row
                table_row = marker.split(" ")
                if len(table_row) != len(table_headers):
                    raise ValueError(
                        f"Line {i}: Malformed table, table body row had {len(table_row)} fields, "
                        f"but table header row had {len(table_headers)} fields."
                    )
                line = "; ".join(
                    f"{header} {value}"
                    for header, value in zip(table_headers, table_row)
                ) + " ;\n"
                print(line)

        for orig, subst in substitutions.items():
            line = line.replace(orig, subst)
        processed.append(line)

    return processed


def split_sentences(all_text):
    """Split *all_text* into ``(sentence, terminator_match)`` pairs.

    Each sentence runs up to and including the terminator found by
    ``SENTENCE_END_RE``; newlines inside a sentence are healed into single
    spaces.  Empty sentences are kept so that positions stay stable.  Text
    left over after the last terminator is returned as a final pair whose
    match is ``None`` (only when it is non-empty).
    """
    def heal(chunk):
        # Heal any newlines due to awkward breaks from copy-pasta
        return " ".join(part.strip() for part in chunk.strip().split("\n"))

    pieces = []
    start = 0
    for sentence_end in SENTENCE_END_RE.finditer(all_text):
        pieces.append((heal(all_text[start:sentence_end.end()]), sentence_end))
        # no +1 needed since end is already one past the real ending character
        start = sentence_end.end()

    tail = heal(all_text[start:])
    if tail:
        pieces.append((tail, None))
    return pieces


def build_tts_url(sentence):
    """Return the TTS request URL for *sentence* (percent-encoded)."""
    text_prompt = quote(sentence)
    return (
        f"{TTS_BASE_URL}/api/tts?text={text_prompt}"
        f"&style_wav=&language_id=&speaker_id={speaker_id}"
    )


def synth_sentence(sentence, i, output_dir="."):
    """Fetch the audio for *sentence* with curl into ``output-<i>.wav``.

    The file is written inside *output_dir*; *i* is zero-padded to three
    digits.  A non-zero curl exit status is reported on stderr but does not
    stop the run.
    """
    query_string = build_tts_url(sentence)
    print(f"Query string {query_string}")
    wav_path = os.path.join(output_dir, f"output-{str(i).zfill(3)}.wav")
    result = subprocess.run(['curl', query_string, '-o', wav_path])
    if result.returncode != 0:
        print(
            f"curl exited with status {result.returncode} for sentence {i}",
            file=sys.stderr,
        )


def main(argv=None):
    """Run the script: read the letter, preprocess it, and synthesize sentences."""
    if argv is None:
        argv = sys.argv
    letter_series, letter_year, fn_stem, real_run = parse_args(argv)

    fn = f"./{letter_series}/{letter_year}/1-in/{fn_stem}-letter.txt"
    output_dir = f"./{letter_series}/{letter_year}/3-wavs"
    os.makedirs(output_dir, exist_ok=True)

    if not real_run:
        print("Dry run, check that sentences are readable.")

    with open(fn, encoding="utf-8") as f:
        lines = f.readlines()

    try:
        lines = preprocess_lines(lines)
    except ValueError as err:
        print(err)
        sys.exit(1)

    sentences = split_sentences(''.join(lines))
    print(f"{len(sentences)} sentences found.\n")

    for i, (sentence, sentence_end) in enumerate(sentences):
        print(f"Sentence {i}:\n\t\"{sentence}\"\n\t{sentence_end}")
        if not sentence:
            print(f"Empty sentence {i} made it through somehow. Continuing...")
            continue
        if real_run:
            synth_sentence(sentence, i, output_dir)


if __name__ == "__main__":
    main()