Script for cleaning HTML.

3 years ago · 0d168b2aa9
parent e567ed223a
commit 0d168b2aa9
2 changed files with 45 additions and 2 deletions
--- a/clean_html.py
+++ b/clean_html.py
@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 import sys
 # Positional command-line arguments: the input file to read, then the
 # output file that receives the stripped text.
 fn = sys.argv[1]
 out_fn = sys.argv[2]
 from bs4 import BeautifulSoup
 def strip_html_tags(html):
    """Return the text content of *html* with <script> elements removed.

    The markup is parsed with BeautifulSoup's 'html.parser' backend, every
    <script> element is detached from the tree, and the text of what is
    left is returned with leading and trailing whitespace stripped.
    """
    document = BeautifulSoup(html, 'html.parser')
    # Detach each script element so it is no longer part of the tree
    # when the text is collected.
    for script_node in document.find_all('script'):
        script_node.extract()
    return document.get_text().strip()
 def read_file(file_path):
    """Read a file as bytes and decode it as UTF-8, dropping invalid bytes.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents as a str. Bytes that cannot be decoded as
        UTF-8 are omitted from the result instead of raising.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        byte_sequence = file.read()
    # errors='ignore' skips undecodable bytes in one pass. Removing a single
    # bad byte and re-decoding the whole buffer on every failure yields the
    # same text but costs quadratic time when many bytes are invalid.
    return byte_sequence.decode('utf-8', errors='ignore')
 # Read the HTML file, strip its markup, and write the plain text out.
 # read_file opens the input path itself, so no separate handle on fn
 # is opened here.
 html_content = read_file(fn)
 stripped_text = strip_html_tags(html_content)
 # Leaving the with block flushes and closes the output file.
 with open(out_fn, "w") as of:
    of.write(stripped_text)
--- a/speak.py
+++ b/speak.py
@ -19,7 +19,7 @@ fn_stem = (len(sys.argv) == 4) and bool(sys.argv[3]) or f"{letter_series}-{lette
 real_run = (len(sys.argv) == 5) and bool(sys.argv[4]) or False
 fn = f"./{letter_series}/{letter_year}/1-in/{fn_stem}-letter.txt"
-pre_fn = f"./{letter_series}/{letter_year}/2-preprocess/{fn_stem}.txt"
+pre_fn = f"./{letter_series}/{letter_year}/2-preprocessed/{fn_stem}-letter.txt"
 output_dir = f"./{letter_series}/{letter_year}/3-wavs"
@ -36,7 +36,7 @@ def synth_sentence(sentence, i):
    print(f"Query string {query_string}")
    subprocess.run(['curl', query_string, '-o', f"./{output_dir}/output-{str(i).zfill(3)}.wav"])
-with open(fn) as f:
+with open(pre_fn) as f:
    lines = f.readlines()
    skip_flag = False