I'm not a Python specialist, so bear with me. I'm trying to replace a Perl HTML::TokeParser-based parser, which I use for translating templates into foreign languages, with one built on Python's html.parser. Here's the prototype code, which nearly gives me what I want.
from html.parser import HTMLParser

import deepl


class MyHTMLParser(HTMLParser):
    """Prototype parser that echoes each tag and hands text off for translation."""

    def handle_starttag(self, tag, attrs):
        """Print the start tag rebuilt from its name, then each attribute pair.

        The rebuilt tag deliberately mirrors the prototype and omits the
        attributes. ``self.get_starttag_text()`` returns the text of the most
        recently opened start tag exactly as it appeared in the input.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        print(f"start <{tag}>")
        for attr in attrs:
            print(" attr:", attr)

    def handle_endtag(self, tag):
        """Print the end tag rebuilt from its name.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
        """
        print(f"end </{tag}>")

    def handle_data(self, data):
        """Pass the text found between tags to ``translate_data``.

        Args:
            data: Text content reported by ``HTMLParser``.
        """
        # translate_data is not part of this excerpt.
        self.translate_data(data)


# etc. etc. and
deepl_client = deepl.DeepLClient(auth_key)

# Translate a formal document from English to French
input_path = "blabla"
output_path = "blabla"

parser = MyHTMLParser()

# Decode explicitly instead of relying on the platform's locale encoding.
# NOTE(review): assumes the templates are stored as UTF-8 — adjust if not.
with open(input_path, "r", encoding="utf-8") as file:
    content = file.read()

parser.feed(content)
# close() forces processing of anything the parser is still buffering, so
# trailing text at the end of the file is not silently dropped.
parser.close()

# However I'd also like access to the raw HTML as it goes through the feed to
# avoid re-assembling the simpler or non-translated tags.