My question is as follows. The code:
# -*- coding:utf-8 -*-
from urllib.request import urlopen
import bs4
import webbrowser
import requests

html_list = []  #
html_list_txt = []  #
movie_list = []  #
movie_total = {}  #
html = "https://movie.douban.com/top250"
html_list.append(html)
#html2 = requests.get("https://movie.douban.com/top250")
#webbrowser.open(html)
#webbrowser.open(html)
html_txt = (urlopen(html)).read()
#html_txt = (requests.get(html)).text
bsObj = bs4.BeautifulSoup(html_txt, "html.parser")
print("---1---")
html_div = bsObj.find("div",{"class":"paginator"})
print("---2---")
html_a = html_div.findAll("a")
print("---3---")
for html_a_temp in html_a:
    # print(type(html_a_temp))
    #i = 2
    #while i <= 9:
    #print(html_a_temp.get_text())
    #if html_a_temp.get_text != ">":
    html_href = html_a_temp.attrs["href"]
    html_href = "https://movie.douban.com/top250" + html_href
    html_list.append(html_href)
    #i += 1
print("---4---")
html_list = list(set(html_list))
print(len(html_list))
#print(html_set)
#print(len(html_set))
#print(set(html_list))
#print(type(""))
#
for html_list_temp in html_list:
    """"""
    html_read = bs4.BeautifulSoup(urlopen(html_list_temp).read(), "html.parser")
    html_list_txt.append(html_read)
for html_page in html_list_txt:
    name_div_list = html_page.findAll("div",{"class":"info"})
    for name_div_temp in name_div_list:
        name_div_inside = name_div_temp.findAll("div")
        movie_name = name_div_inside[0].a.span.get_text()  #
        name_div_star = name_div_temp.find("div",{"class":"star"})
        name_div_star_span = name_div_star.findAll("span")
        movie_score = name_div_star_span[1].get_text()  #
        movie_number = name_div_star_span[3].get_text()  #
        # -----------
        #name_span_inq = name_div_temp.findAll("p")[1].span.get_text()
        #movie_introduction = name_span_inq
        #name_span_inq = name_div_temp.find("span",{"class":"inq"})
        print("----ttt----")
        #movie_introduction = name_span_inq.get_text()  #
        movie_total["name"] = movie_name
        movie_total["score"] = movie_score
        movie_total["number"] = movie_number
        #movie_total["introduction"] = movie_introduction
        movie_list.append(movie_total)
print(movie_list)
"""
name_div_inside_span_list = name_div_inside[1].div.findAll("span")
for name_div_inside_span_temp in name_div_inside_span_list:
    movie_score = name_div_inside[1].div.span[1].get_text()  #
    movie_number = name_div_inside[1].div.span[3].get_text()  #
"""
#movie_total[name] = movie_name
"""
name_div_list = html_page.findAll("div",{"class":"hd"})
for name_div_temp in name_div_list:
    movie_name = name_div_temp.a.span.get_text()
    movie_name_list.append(movie_name)
"""
#print(movie_name_list)
The running result is as follows:

I am still testing this crawler, and there are other problems left to solve. But what I want to ask now is: when I crawl the Douban Top 250 movies, the script grabs one movie at a time and appends it to the list, yet the final printout shows the same movie repeated for every entry. Why does that happen, and how can I change the code so that each entry in the list is a different movie?
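To make the problem concrete, here is a minimal, self-contained sketch (hypothetical sample titles, no network access) that follows the same pattern as my loop above, where a single movie_total dict is created once and appended on every iteration, and it shows the same symptom:

# minimal reproduction of the pattern used in the crawler above
movie_total = {}  # one dict created once, outside the loop
movie_list = []
for name in ["Movie A", "Movie B", "Movie C"]:  # hypothetical sample data
    movie_total["name"] = name          # overwrites the same dict each time
    movie_list.append(movie_total)      # appends a reference to that same dict
print(movie_list)
# prints [{'name': 'Movie C'}, {'name': 'Movie C'}, {'name': 'Movie C'}]

Is the problem that movie_list ends up holding several references to the same dict object, and if so, what is the correct way to build a separate dict for each movie?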