try:
req = urllib.request.Request(url, headers=hds[page_num%len(hds)])
source_code = urllib.request.urlopen(req).read()
plain_text=str(source_code)
except:
print ("Error.")
continue
soup = BeautifulSoup(plain_text, from_encoding="utf-8")
list_soup = soup.find("div", {"class": "mod book-list"})
try_times+=1;
if list_soup==None and try_times<200:
continue
elif list_soup==None or len(list_soup)<=1:
break -sharp Break when no informatoin got after 200 times requesting
for book_info in list_soup.findAll("dd"):
title = book_info.find("a", {"class":"title"}).string.strip()
desc = book_info.find("div", {"class":"desc"}).string.strip()
desc_list = desc.split("/")
book_url = book_info.find("a", {"class":"title"}).get("href")
-sharp
print(title)
The above code is a crawler that scrapes Douban. Why is the title (`title`) of each book I crawled printed as a string of UTF-8 escape codes instead of a normal string? As shown below:
Since the obtained title (`title`) is still of type `str`, it cannot be decoded using the `decode` method. Is there any other way to solve this?