"""Split a Buffett-partnership letter into sentences and synthesize each to a WAV.

Usage: script.py [letter_series] [letter_year] [real_run]

Reads the raw letter text, applies abbreviation substitutions (so trailing
periods in abbreviations are not mistaken for sentence ends), splits the text
into sentences with a regex, and — when real_run is truthy — sends each
sentence to a local TTS server via curl, saving numbered WAV files.
"""
import requests
from urllib.parse import quote
import subprocess
import os
import sys
import re

# TTS voice to request from the local server.
speaker_id = "p230"

# Text fixups applied before sentence splitting, because abbreviations with
# trailing periods would otherwise be treated as sentence boundaries.
SUBSTITUTIONS = {
    " - ": "--",
    "Ave.": "Avenue,",
    "Co.": "Company",
}

# CLI arguments with defaults. The original `sys.argv[1] or "..."` form raised
# IndexError when the argument was missing, so the fallback never applied.
letter_series = sys.argv[1] if len(sys.argv) > 1 else "buffett-partnership"
letter_year = sys.argv[2] if len(sys.argv) > 2 else "1956"
# Only synthesize audio when a truthy third argument is supplied;
# otherwise this is a dry run that just prints the sentences.
real_run = len(sys.argv) == 4 and bool(sys.argv[3])

fn = f"./{letter_series}/{letter_year}/1-in/bpl-{letter_year}-letter.txt"
pre_fn = f"./{letter_series}/{letter_year}/2-preprocess/bpl-{letter_year}.txt"
output_dir = f"./{letter_series}/{letter_year}/3-wavs"
# Idiomatic replacement for subprocess.run(['mkdir', '-p', output_dir]).
os.makedirs(output_dir, exist_ok=True)

if not real_run:
    print("Dry run, check that sentences are readable.")


def synth_sentence(sentence, i):
    """Request TTS for sentence number `i` and save the WAV under output_dir."""
    text_prompt = quote(sentence)
    # If hosted on IPv6 use http://[::1]:5002
    # If hosted on IPv4 use http://localhost:5002
    # The backslashes escape `[`/`]` for curl's URL globbing (alternative:
    # pass --globoff). Written as `\\[` to produce the same literal backslash
    # without the invalid-escape SyntaxWarning the bare `\[` raises on
    # modern Python.
    query_string = f"http://\\[::1\\]:5002/api/tts?text={text_prompt}&style_wav=&language_id=&speaker_id={speaker_id}"
    print(f"Query string {query_string}")
    # output_dir already starts with "./", so no extra "./" prefix is needed.
    subprocess.run(['curl', query_string, '-o', f"{output_dir}/output-{str(i).zfill(3)}.wav"])


with open(fn) as f:
    lines = f.readlines()

# Do substitutions first, because often they affect sentence splitting
for i in range(len(lines)):
    for (orig, subst) in SUBSTITUTIONS.items():
        lines[i] = lines[i].replace(orig, subst)

all_text = ''.join(lines)
# A sentence ends at a 2+-letter word followed by . : or ; plus whitespace,
# or at a run of 2+ whitespace characters (blank line / paragraph break).
sentence_ends = list(re.finditer(r"(\w\w+([\.:;][\s\n])|([\s\n][\s\n]+))", all_text))
start = 0
count = len(sentence_ends)
print(f"{count} sentences found.\n")
for (i, sentence_end) in enumerate(sentence_ends):
    sentence = all_text[start:sentence_end.end()].strip()
    # Heal any newlines due to awkward breaks from copy-pasta
    sentence = " ".join(map(lambda x: x.strip(), sentence.split("\n")))
    print(f"Sentence {i}:\n\t\"{sentence}\"\n\t{sentence_end}")
    start = sentence_end.end()  # no +1 needed since end is already one past the real ending character
    if len(sentence.strip()) == 0:
        print(f"Empty sentence {i} made it through somehow. Continuing...")
        # BUG FIX: the original printed "Continuing..." but fell through and
        # synthesized the empty sentence anyway; actually skip it.
        continue
    if real_run:
        synth_sentence(sentence, i)