Код самого парсера:
from http.client import HTTPException
from urllib import request

try:
    from html.parser import HTMLParser, HTMLParseError
except ImportError:
    # HTMLParseError was removed from html.parser in Python 3.5; keep a
    # stand-in so existing `except HTMLParseError` clauses remain valid.
    from html.parser import HTMLParser

    class HTMLParseError(Exception):
        """Placeholder for the exception removed in Python 3.5."""
# Page that main() downloads and scans for <h1> headings.
URL = "http://mds.podfm.ru/150/"
class MDSParser(HTMLParser):
    """HTML parser that prints the text found inside every <h1> element.

    Feed it markup with feed(); each piece of character data seen between an
    opening <h1> tag and the matching closing tag is written to stdout.
    """

    # True while the parser is between an <h1> start tag and its end tag.
    inHeading = False

    def reset(self):
        """Reset the parser, clearing the heading flag as well.

        Without this, a parser reused after a document that ended inside an
        unclosed <h1> would keep printing text of the next document.
        """
        super().reset()
        self.inHeading = False

    def handle_starttag(self, tag, attrs):
        """Record that an <h1> element has been opened."""
        if tag == "h1":
            self.inHeading = True

    def handle_data(self, data):
        """Print character data, but only while inside an <h1> element."""
        if self.inHeading:
            print(data)

    def handle_endtag(self, tag):
        """Record that the current <h1> element has been closed."""
        if tag == "h1":
            self.inHeading = False
def openPage(URL):
    """Download the page at URL and return its content as decoded text.

    Args:
        URL: Address of the page to fetch.

    Returns:
        The page body as a str, or None when the page could not be fetched
        (a message is printed in that case).
    """
    print("[-] Trying to open " + URL)
    try:
        # The context manager closes the connection even if read() fails.
        with request.urlopen(URL) as response:
            raw = response.read()
            # Use the charset declared in the Content-Type header, if any.
            charset = response.headers.get_content_charset() or "utf-8"
    except (OSError, ValueError, HTTPException):
        # URLError/HTTPError derive from OSError; ValueError covers malformed
        # URLs; HTTPException covers broken or truncated HTTP responses.
        print("[-] Error while opening URL")
        return None

    # Decode the bytes properly: str(raw) would yield the "b'...\\xd0...'"
    # repr, whose backslash escapes corrupt the markup for the HTML parser.
    try:
        text = raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        text = raw.decode("utf-8", errors="replace")
    print("[+] Successful")
    return text
def main():
    """Fetch the page at URL and print the text of its <h1> headings."""
    toParse = openPage(URL)
    if toParse is None:
        # openPage returns None (after printing an error) when the download
        # fails; None is not valid input for the parser, so stop here.
        return

    podcast = MDSParser()
    try:
        podcast.feed(toParse)
        # close() makes the parser process any data it is still buffering.
        podcast.close()
    except HTMLParseError as e:
        print("[-] Error while parsing: " + str(e))
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
malformed start tag, at line 1, column 430
А там => \xb0. "\xd