crawl the title and price of goods in Amazon China, Mobile phone-> Mobile Communications-> Apple Phone.
its URL= https://www.amazon.cn/s/ref=s.
my python code is as follows:
import requests
from bs4 import BeautifulSoup
import re -sharpHTML
import time -sharp
-sharp
def get_total_page_number():
user_agent = "Mozilla/5.0 (Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/45.0.2454.101 Safari/537.36"
headers = {"User-Agent":user_agent} -sharp
-sharp URL
url = "https://www.amazon.cn/s/ref=sa_menu_digita_l3_siphone?ie=UTF8&page=1&rh=n%3A665002051%2Cp_89%3AApple%2Cn%3A664978051"
res = requests.get(url,headers=headers) -sharp
html = res.text
html_soup = BeautifulSoup(html,"lxml") -sharpsouphtml
page_number_span = html_soup.find("h2",id="s-result-count") -sharpid="s-result-count"h2
page_number_code = page_number_span.text -sharp
number_list = re.findall(r"(\w*[0-9]+)\w",page_number_code) -sharp3
total_page_number = (int(number_list[-1])/int(number_list[-2])+1) -sharp
return int(total_page_number) -sharp
-sharp
def parse_single_page(i):
url_part1 = "https://www.amazon.cn/s/ref=sa_menu_digita_l3_siphone?ie=UTF8&page=%d" % i -sharpURL
url_part2 = "&rh=n%3A665002051%2Cp_89%3AApple%2Cn%3A664978051" -sharpURL
url = url_part1 + url_part2 -sharpURL
print ("prase url: %s" % url) -sharpURL
user_agent = "Mozilla/5.0 (Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/45.0.2454.101 Safari/537.36"
res = requests.get(url,headers=headers) -sharp
html = res.text
html_soup = BeautifulSoup(html,"lxml") -sharpsouphtml
tag_list = html_soup.find_all("li", id=re.compile("^result.*")) -sharpidresultli
-sharp()
for tag_info in tag_list:
-sharp
print (tag_info)
price_code = tag_info.find("span", class_="a-size-base a-color-price s-price a-text-bold")
-sharp
if price_code != None:
-sharp
title_code = tag_info.find("h2") -sharp
title = title_code.text -sharp
write_data(title,price) -sharp
-sharp
def write_data(title,price):
file_data = time.strftime("%Y-%m-%d",time.localtime(time.time())) -sharp
fn = open("%s.txt" % file_data,"a+") -sharp
content = title + "\t" + price + "\n" -sharptab
fn.write(content) -sharp
fn.close()
-sharp
def main():
total_page_number = get_total_page_number() -sharp
for i in range(1,int(total_page_number)+1):
parse_single_page(i)
main()
the error reported is as follows:
AttributeError Traceback (most recent call last)
<ipython-input-5-5527ff76ca42> in <module>()
51 parse_single_page(i)
52
---> 53 main()
<ipython-input-5-5527ff76ca42> in main()
47 -sharp
48 def main():
---> 49 total_page_number = get_total_page_number() -sharp
50 for i in range(1,int(total_page_number)+1):
51 parse_single_page(i)
<ipython-input-5-5527ff76ca42> in get_total_page_number()
9 html_soup = BeautifulSoup(html,"lxml") -sharpsouphtml
10 page_number_span = html_soup.find("h2",id="s-result-count") -sharpid="s-result-count"h2
---> 11 page_number_code = page_number_span.text -sharp
12 number_list = re.findall(r"(\w*[0-9]+)\w",page_number_code) -sharp3
13 total_page_number = (int(number_list[-1])/int(number_list[-2])+1) -sharp
AttributeError: "NoneType" object has no attribute "text"
I have solved some problems, but I still can"t solve this problem if I look it up on the Internet. I ask for God"s help, thank you!
I have comments on almost every line. I hope I can help God read it effectively. Thank you!