diff --git a/clean_html.py b/clean_html.py
new file mode 100644
index 0000000..0c377bd
--- /dev/null
+++ b/clean_html.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import sys
+
+# Command-line arguments: input HTML path first, then the output text path.
+fn, out_fn = sys.argv[1], sys.argv[2]
+
+from bs4 import BeautifulSoup
+
+def strip_html_tags(html):
+    """Return the text content of *html* with all tags removed.
+
+    ``<script>`` and ``<style>`` elements are dropped together with their
+    contents, and leading/trailing whitespace is stripped from the result.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Some BeautifulSoup releases return JavaScript/CSS source from
+    # get_text(), so detach those elements before extracting the text.
+    for tag in soup.find_all(['script', 'style']):
+        tag.extract()
+
+    return soup.get_text().strip()
+
+def read_file(file_path):
+    """Read *file_path* and return its contents decoded as UTF-8.
+
+    Bytes that do not form valid UTF-8 are dropped instead of raising
+    UnicodeDecodeError.
+    """
+    with open(file_path, 'rb') as file:
+        byte_sequence = file.read()
+
+    # errors='ignore' discards undecodable bytes in a single pass; deleting
+    # one bad byte and re-decoding the whole buffer each time is quadratic.
+    return byte_sequence.decode('utf-8', errors='ignore')
+
+# Convert the HTML file to plain text and write it out. read_file() opens
+# the input itself, so no separate handle on `fn` is needed here.
+html_content = read_file(fn)
+stripped_text = strip_html_tags(html_content)
+# The with-block flushes and closes the output file on exit.
+with open(out_fn, "w") as of:
+    of.write(stripped_text)
diff --git a/speak.py b/speak.py
index d7e940c..2dfcdfc 100644
--- a/speak.py
+++ b/speak.py
@@ -19,7 +19,7 @@ fn_stem = (len(sys.argv) == 4) and bool(sys.argv[3]) or f"{letter_series}-{lette
real_run = (len(sys.argv) == 5) and bool(sys.argv[4]) or False
fn = f"./{letter_series}/{letter_year}/1-in/{fn_stem}-letter.txt"
-pre_fn = f"./{letter_series}/{letter_year}/2-preprocess/{fn_stem}.txt"
+pre_fn = f"./{letter_series}/{letter_year}/2-preprocessed/{fn_stem}-letter.txt"
output_dir = f"./{letter_series}/{letter_year}/3-wavs"
@@ -36,7 +36,7 @@ def synth_sentence(sentence, i):
print(f"Query string {query_string}")
subprocess.run(['curl', query_string, '-o', f"./{output_dir}/output-{str(i).zfill(3)}.wav"])
-with open(fn) as f:
+with open(pre_fn) as f:
lines = f.readlines()
skip_flag = False