parent
e567ed223a
commit
0d168b2aa9
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
|
||||
# Command-line interface: <input HTML path> <output text path>.
# Fail with a short usage message (exit status 1) instead of an IndexError
# traceback when an argument is missing; extra arguments are still ignored.
if len(sys.argv) < 3:
    sys.exit("usage: {} INPUT_HTML OUTPUT_TEXT".format(sys.argv[0]))

fn = sys.argv[1]      # path of the HTML file to read
out_fn = sys.argv[2]  # path of the text file to write
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def strip_html_tags(html):
    """Return the text content of an HTML document without its markup.

    The markup is parsed with BeautifulSoup's 'html.parser' backend, every
    <script> element is detached from the tree, and the text of what
    remains is returned with leading and trailing whitespace stripped.

    Args:
        html: The HTML markup to process.

    Returns:
        The extracted text, stripped at both ends.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Detach each <script> element so it is gone before the text is gathered.
    for script_tag in document.find_all('script'):
        script_tag.extract()

    return document.get_text().strip()
|
||||
|
||||
def read_file(file_path):
    """Read a file as UTF-8 text, discarding bytes that cannot be decoded.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The file content decoded as UTF-8, with every undecodable byte
        removed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as source:
        raw_bytes = source.read()

    # errors='ignore' drops undecodable bytes in a single pass. This yields
    # the same text as deleting one offending byte and re-decoding the whole
    # buffer until it succeeds, without the quadratic cost on dirty input.
    return raw_bytes.decode('utf-8', errors='ignore')
|
||||
|
||||
# Convert the input HTML file to plain text and save the result.
# read_file opens the input itself, so no separate handle is opened here.
html_content = read_file(fn)
stripped_text = strip_html_tags(html_content)

# The input is decoded as UTF-8, so the output is written as UTF-8 too
# rather than with the platform's default encoding, which may be unable to
# represent some characters. Leaving the with-block flushes and closes it.
with open(out_fn, "w", encoding="utf-8") as of:
    of.write(stripped_text)
|
||||
Loading…
Reference in new issue