problem description
I used pyspider to crawl some data today. The site requires both stepping through the pages of a list and extracting the data on each page.
The URL is http://www.cstc.org.cn/templet/default/rjdjcs.jsp (the full query string appears in the code below).
I wrote two versions of the code. Version 1 causes the web page to report "too many connections" errors after it has been crawling for about half a day.
Version 2 runs into what looks like a pyspider bug (version 2 does not use the standard way of storing results, which may be related).
Related code
// Please paste the code as text below (do not substitute pictures for the code)
version 1
from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb3 import MySQL
class Handler(BaseHandler):
    """Crawl every page of the CSTC software-registration test list into MySQL.

    ``on_start`` queues one request per list page, ``detail_page`` collects the
    ``<td>`` cells of every table row on a page, and ``on_result`` writes each
    row into the ``Test_notice_all`` table.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
        }
    }

    # List URL up to (but not including) the ``pn=<page>`` parameter.
    _LIST_URL = (
        "http://www.cstc.org.cn/templet/default/rjdjcs.jsp"
        "?type=%C8%ED%BC%FE%B2%FA%C6%B7%B5%C7%BC%C7%B2%E2%CA%D4"
        "&id=1218&tdsourcetag=s_pcqq_aiomsg&"
    )
    # Inclusive page range to crawl.
    _FIRST_PAGE = 1
    _LAST_PAGE = 3095

    def _get_db(self):
        """Return this handler's MySQL helper, creating it on first use.

        The helper is cached on the instance so that all rows share one
        object instead of building a new ``MySQL()`` for every row.
        NOTE(review): presumably ``MySQL()`` opens a database connection when
        constructed, which would explain the "too many connections" errors;
        confirm against the ``mysqldb3`` module.
        """
        db = getattr(self, "_db", None)
        if db is None:
            db = self._db = MySQL()
        return db

    def on_start(self):
        """Queue one crawl request for every list page."""
        for page in range(self._FIRST_PAGE, self._LAST_PAGE + 1):
            # Only the "pn=%d" piece is %-formatted: the URL prefix itself
            # contains literal percent-escapes and must not be interpolated.
            url = self._LIST_URL + "pn=%d" % page
            self.crawl(url, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the table of one list page.

        Returns:
            ``{"ll": rows}`` where each row is the list of ``text`` values of
            the ``<td>`` elements found in one ``tbody>tr``.
        """
        rows = [
            [td.text for td in tr.find("td")]
            for tr in response.doc("tbody>tr").items()
        ]
        return {"ll": rows}

    def on_result(self, result):
        """Write the rows returned by ``detail_page`` to ``Test_notice_all``.

        Empty results are ignored. Rows with fewer than three cells (for
        example a header row without ``<td>`` cells) are skipped instead of
        raising ``IndexError`` and losing the rest of the page.
        NOTE(review): overriding ``on_result`` presumably replaces pyspider's
        default result handling; confirm that is intended.
        """
        if not result or not result["ll"]:
            return
        db = self._get_db()
        for row in result["ll"]:
            if len(row) < 3:
                continue
            db.replace(
                "Test_notice_all",
                qymc=row[0],
                reportName=row[1],
                reportId=row[2],
            )
Related code
// Please paste the code as text below (do not substitute pictures for the code)
version 2
from pyspider.libs.base_handler import *
from pyspider.database.mysql.mysqldb3 import MySQL
class Handler(BaseHandler):
    """Crawl the CSTC software-registration test list one page at a time.

    ``on_start`` requests page 1 only. Each ``detail_page`` call stores the
    rows of its page in the ``Test_notice_all_no_error`` table and then
    requests the following page, carrying the page number in ``save``.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
        }
    }

    # List URL up to (but not including) the ``pn=<page>`` parameter.
    _LIST_URL = (
        "http://www.cstc.org.cn/templet/default/rjdjcs.jsp"
        "?type=%C8%ED%BC%FE%B2%FA%C6%B7%B5%C7%BC%C7%B2%E2%CA%D4"
        "&id=1218&tdsourcetag=s_pcqq_aiomsg&"
    )
    # Inclusive page range to crawl.
    _FIRST_PAGE = 1
    _LAST_PAGE = 3095

    def _page_url(self, page):
        """Build the list URL for ``page``."""
        # Only the "pn=%d" piece is %-formatted: the URL prefix itself
        # contains literal percent-escapes and must not be interpolated.
        return self._LIST_URL + "pn=%d" % page

    def _get_db(self):
        """Return this handler's MySQL helper, creating it on first use.

        The helper is cached on the instance so that all rows share one
        object instead of building a new ``MySQL()`` for every row.
        NOTE(review): presumably ``MySQL()`` opens a database connection when
        constructed; confirm against the ``mysqldb3`` module.
        """
        db = getattr(self, "_db", None)
        if db is None:
            db = self._db = MySQL()
        return db

    def on_start(self):
        """Request the first list page."""
        self.crawl(
            self._page_url(self._FIRST_PAGE),
            callback=self.detail_page,
            save=self._FIRST_PAGE,
        )

    @config(priority=2)
    def detail_page(self, response):
        """Store the rows of the current page, then request the next page.

        The current page number is read from ``response.save``. Every stored
        row is ``[cell0, cell1, cell2, page]``; rows with fewer than three
        ``<td>`` cells (for example a header row) are dropped here so that
        they cannot raise ``IndexError`` in ``parseResult`` and abort the
        callback before the next page has been scheduled.
        """
        page = int(response.save)
        rows = []
        for tr in response.doc("tbody>tr").items():
            cells = [td.text for td in tr.find("td")]
            if len(cells) < 3:
                continue
            # Keep exactly three data cells so index 3 is always the page.
            rows.append(cells[:3] + [page])
        self.parseResult(rows)

        next_page = page + 1
        if next_page <= self._LAST_PAGE:
            self.crawl(
                self._page_url(next_page),
                callback=self.detail_page,
                save=next_page,
            )

    def parseResult(self, ll):
        """Write rows to ``Test_notice_all_no_error``.

        Args:
            ll: list of rows; indexes 0-2 are the data cells and index 3 is
                the page number. Rows with fewer than four items are skipped.
        """
        if not ll:
            return
        db = self._get_db()
        for row in ll:
            if len(row) < 4:
                continue
            db.replace(
                "Test_notice_all_no_error",
                qymc=row[0],
                reportName=row[1],
                reportId=row[2],
                page=row[3],
            )
What result do you expect? What error message do you actually see?
problem description
problems with version 1:
p.s. 1
2:
:
Question: for a site like this, where you need both the data on each page and to step through the pages, what is the better way to crawl it?