You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

44 lines
1.1 KiB

#!/usr/bin/env python3
import sys
# Command-line arguments: first is the input HTML path, second is the path
# the extracted text is written to.
fn, out_fn = sys.argv[1], sys.argv[2]
from bs4 import BeautifulSoup
def strip_html_tags(html):
    """Return the text content of *html* with <script> elements removed.

    The markup is parsed with BeautifulSoup's 'html.parser' backend, every
    <script> element is detached from the tree, and the text of what remains
    is returned with leading and trailing whitespace stripped.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Detach each <script> element so its source code is not part of the text.
    for script_node in document.find_all('script'):
        script_node.extract()

    # Collect the text of the remaining tree and trim surrounding whitespace.
    return document.get_text().strip()
def read_file(file_path):
    """Read *file_path* and return its content decoded as UTF-8.

    Bytes that do not form valid UTF-8 are silently dropped instead of
    raising ``UnicodeDecodeError``, so the function always returns a string
    for any readable file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text with every undecodable byte removed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read raw bytes (not text mode) so line endings are left untouched and
    # decoding errors can be handled explicitly below.
    with open(file_path, 'rb') as file:
        byte_sequence = file.read()

    # errors='ignore' drops invalid bytes in a single pass; this replaces a
    # retry loop that removed one bad byte at a time and re-decoded the whole
    # buffer after each removal (quadratic on badly damaged files).
    return byte_sequence.decode('utf-8', errors='ignore')
# Read the HTML file. read_file opens the path itself and drops bytes that
# are not valid UTF-8, so no separate file handle is opened here.
html_content = read_file(fn)
stripped_text = strip_html_tags(html_content)

# Write the extracted text as UTF-8 explicitly: the input was decoded as
# UTF-8, and the locale's default encoding may be unable to represent it.
# Leaving the with-block closes the file, which also flushes it.
with open(out_fn, "w", encoding="utf-8") as of:
    of.write(stripped_text)