parent
e567ed223a
commit
0d168b2aa9
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line arguments: input HTML path, then output text path.
# Missing arguments surface as IndexError, exactly as with separate lookups.
fn, out_fn = sys.argv[1], sys.argv[2]
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
def strip_html_tags(html):
    """Return the text content of *html* with markup and scripts removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, every
    ``<script>`` element is detached from the tree, and the remaining text is
    returned with leading and trailing whitespace stripped.
    """
    document = BeautifulSoup(html, 'html.parser')

    # Detach each <script> element so its contents never reach the output.
    for script_node in document.find_all('script'):
        script_node.extract()

    # Collect the text of whatever is left and trim the outer whitespace.
    return document.get_text().strip()
|
||||||
|
|
||||||
|
def read_file(file_path):
    """Return the contents of *file_path* decoded as UTF-8, dropping bad bytes.

    The file is read in binary mode and decoded with ``errors='ignore'``, so
    bytes that do not form a valid UTF-8 sequence are discarded instead of
    raising ``UnicodeDecodeError``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text as ``str`` (empty string for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        byte_sequence = file.read()

    # One decoding pass with errors='ignore' produces the same text as
    # deleting each offending byte and re-decoding the whole buffer, but
    # without the quadratic re-decode/copy cost on badly damaged input.
    return byte_sequence.decode('utf-8', errors='ignore')
|
||||||
|
|
||||||
|
# Read the HTML file and reduce it to plain text.
# read_file() opens the input itself, so no separate handle is opened here.
html_content = read_file(fn)
stripped_text = strip_html_tags(html_content)

# Write the result as UTF-8 explicitly: the text was decoded from UTF-8 and
# may contain characters the platform's default encoding cannot represent.
# Leaving the `with` block flushes and closes the file.
with open(out_fn, "w", encoding="utf-8") as of:
    of.write(stripped_text)
|
||||||
Loading…
Reference in new issue