Script for cleaning HTML.

3 years ago · 0d168b2aa9
parent e567ed223a
commit 0d168b2aa9
2 changed files with 45 additions and 2 deletions
--- a/clean_html.py
+++ b/clean_html.py
@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import sys
+
+# Command line: first argument is the HTML file to read, second is the text file to write.
+fn, out_fn = sys.argv[1], sys.argv[2]
+
+from bs4 import BeautifulSoup
+
+def strip_html_tags(html):
+    """Return the text content of an HTML document with markup removed.
+
+    The markup is parsed with BeautifulSoup's 'html.parser' backend, every
+    <script> element is detached from the tree, and the text of what remains
+    is returned with leading and trailing whitespace stripped.
+    """
+    document = BeautifulSoup(html, 'html.parser')
+
+    # Detach scripts before dumping the text so their source is never included.
+    for script_node in document.find_all('script'):
+        script_node.extract()
+
+    # Collect the text of everything left and trim the outer whitespace/newlines.
+    return document.get_text().strip()
+
+def read_file(file_path):
+    """Read a file as UTF-8 text, dropping any bytes that are not valid UTF-8.
+
+    Args:
+        file_path: Path of the file to read.
+
+    Returns:
+        The decoded contents as a str. Bytes that cannot be decoded as UTF-8
+        are removed silently instead of raising UnicodeDecodeError.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    with open(file_path, 'rb') as file:
+        byte_sequence = file.read()
+
+    # errors='ignore' skips malformed bytes in a single pass, rather than
+    # copying the buffer and re-decoding it from the start for every bad byte.
+    return byte_sequence.decode('utf-8', errors='ignore')
+
+# Read the HTML file, reduce it to plain text and write the result to out_fn.
+# read_file() opens fn itself, so no separate file handle is needed here.
+html_content = read_file(fn)
+stripped_text = strip_html_tags(html_content)
+
+# The input is decoded as UTF-8, so write UTF-8 explicitly instead of relying
+# on the platform default encoding, which may not be able to encode the text.
+with open(out_fn, "w", encoding="utf-8") as of:
+    of.write(stripped_text)
--- a/speak.py
+++ b/speak.py
@ -19,7 +19,7 @@ fn_stem = (len(sys.argv) == 4) and bool(sys.argv[3]) or f"{letter_series}-{lette
 real_run = (len(sys.argv) == 5) and bool(sys.argv[4]) or False

 fn = f"./{letter_series}/{letter_year}/1-in/{fn_stem}-letter.txt"
-pre_fn = f"./{letter_series}/{letter_year}/2-preprocess/{fn_stem}.txt"
+pre_fn = f"./{letter_series}/{letter_year}/2-preprocessed/{fn_stem}-letter.txt"

 output_dir = f"./{letter_series}/{letter_year}/3-wavs"

@ -36,7 +36,7 @@ def synth_sentence(sentence, i):
    print(f"Query string {query_string}")
    subprocess.run(['curl', query_string, '-o', f"./{output_dir}/output-{str(i).zfill(3)}.wav"])

-with open(fn) as f:
+with open(pre_fn) as f:
    lines = f.readlines()

    skip_flag = False