put the code first
-sharp coding=utf-8
from bs4 import BeautifulSoup
import requests
from time import sleep
-sharp
City_Name = "qd"
page = "http://newhouse.{0}.fang.com/house/s".format(City_Name)
-sharpDownload_Newitem_List()
def Download_Newitem_List(url,try_num=2):
global City_Name
print(":",url)
try:
all_html = requests.get(url,timeout=10)
except Exception as e:
print(":",e.reason)
all_html=None
if try_num >0:
if hasattr(e,"code") and 500 <= e.code <600:
return Download_Newitem_List(url,try_num-1)
all_html.encoding = "gb18030"
soup = BeautifulSoup(all_html.text, "html5lib")
-sharp
Item_Total = soup.find("a", id="allUrl").find("span").text.replace("(","").replace(")","")
-sharp020+1
if (int(Item_Total) % 20)>0:
Page_Num = (int(Item_Total) // 20) + 1
else:
Page_Num = (int(Item_Total) // 20)
with open("{0}_list_link.txt".format(City_Name), "w",encoding="utf-8") as f:
for i in range(1, Page_Num + 1):
New_Page_Link = "http://newhouse.{0}.fang.com/house/s/b9{1}".format(City_Name, i)
print(New_Page_Link)
print(New_Page_Link, file=f)
-sharpDownload_item_link(City)
def Download_item_link(City):
with open("{0}_list_link.txt".format(City), "r",encoding="utf-8") as f:
-sharpprint(f.readlines())
for line in f.readlines():
print(":", line)
sleep(2)
try:
all_html = requests.get(line,timeout=10)
all_html.encoding = "gb18030"
-sharpprint(all_html.text)
except Exception as e:
print(":", e)
-sharpif try_num > 0:
-sharp if hasattr(e, "code") and 500 <= e.code < 600:
-sharp return Download_Newitem_List(url, try_num - 1)
soup = BeautifulSoup(all_html.text, "html5lib")
master_html=soup.find_all("div", class_="nlcd_name")
with open("{0}_Newall_link.txt".format(City), "w",encoding="utf-8") as d:
for link in master_html:
-sharpprint(link.get_text().rstrip() + ":" + link.a["href"].rstrip())
print(link.a["href"].rstrip(),file=d)
Download_Newitem_List(page)
Download_item_link("qd")
the above code can be directly run in ide
take Qingdao as an example (qd) I extracted the total real estate project 482, 25 page links are also extracted, but when I use the Download_item_link () function to extract links to each project in the list, there is a problem, there should be 482 links in the qd_Newall_link.txt file, but how to do it is 20 links. I"ve been thinking about it for a long time, but I can"t figure out what the problem is.
I hope some great god can help me take a look.