When paginating, the spider only collects the last record on each page. What is wrong with it? Any help would be appreciated.
import sys
sys.path.append("..")
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from items import CnblogItem
from scrapy.http import Request
from scrapy.http import FormRequest
import scrapy
class ListSpider(scrapy.Spider):
    """Scrape the paginated listing at ``permission.jsp`` and its detail pages.

    ``parse`` handles a listing page: it requests the detail page linked from
    every table row and then follows the "next page" link. ``parse_info``
    handles a detail page and yields one ``CnblogItem`` per matching table.

    The class derives from plain ``scrapy.Spider``: no ``rules`` are defined,
    and the Scrapy documentation warns against overriding ``parse`` on a
    ``CrawlSpider`` because that class uses ``parse`` for its own rule
    handling.
    """

    # Name used to select this spider (``scrapy crawl cnblog``).
    name = "cnblog"

    # Host only. An entry that includes a path can never equal a request's
    # hostname, so it is useless to the offsite filter.
    allowed_domains = ["218.28.223.13"]

    # First listing page; the following pages are discovered in ``parse``.
    start_urls = [
        "http://218.28.223.13/zzzfdc/zhengzhou/permission.jsp?pn=&cn=&it=&pager.offset=0&page=1"
    ]

    # Text of the "next page" anchor on a listing page.
    # NOTE(review): the label was lost from the pasted source (the XPath
    # compared text() against nothing). u"\u4e0b\u4e00\u9875" ("next page")
    # is an assumption -- confirm against the live page.
    _NEXT_PAGE_LABEL = u"\u4e0b\u4e00\u9875"

    # (item field, XPath relative to the detail table), in output order.
    _DETAIL_XPATHS = (
        ("presellno", "tr[1]/td[2]/text()"),
        ("verifydate", "tr[1]/td[4]/text()"),
        ("developer", "tr[2]/td[2]/text()"),
        ("projectname", "tr[3]/td[2]//span[1]/text()"),
        ("location", "tr[4]/td[2]/text()"),
        ("application", "tr[6]/td[1]/text()"),
        ("count", "tr[6]/td[2]/div/text()"),
        ("area", "tr[6]/td[3]/div/text()"),
        ("building", "tr[9]/td[2]/text()"),
    )

    def parse(self, response):
        """Yield a detail request for every listing row, then the next page.

        :param response: response for one listing page.
        :returns: generator of ``Request`` objects; detail requests carry the
            listing page label in ``meta["page"]``.
        """
        # The page label is the same for every row, so read it once.
        # extract_first() gives None instead of raising when it is missing.
        page = response.xpath(
            '//center/font[@color="red"]/text()'
        ).extract_first()

        rows = response.xpath('//table[@id="AutoNumber5"]//tr[@height="35"]')
        for row in rows:
            href = row.xpath("td[2]/a/@href").extract_first()
            if not href:
                # Row without a detail link: nothing to request.
                continue
            # The request must be yielded INSIDE the loop; yielding it after
            # the loop is what makes only the last row of a page get scraped.
            yield Request(
                response.urljoin(href),
                callback=self.parse_info,
                dont_filter=True,
                meta={"page": page},
            )

        next_href = response.xpath(
            u'//center/a[text()="%s"]/@href' % self._NEXT_PAGE_LABEL
        ).extract_first()
        if next_href:
            next_url = response.urljoin(next_href)
            self.logger.info("Following next page: %s", next_url)
            yield Request(next_url, callback=self.parse, dont_filter=True)

    def parse_info(self, response):
        """Yield one ``CnblogItem`` per detail table on the page.

        :param response: response for a detail page; ``meta["page"]`` holds
            the listing page label set by ``parse``.
        :returns: generator of populated ``CnblogItem`` objects. Fields whose
            XPath matches nothing are set to ``None``.
        """
        for table in response.xpath('//table[@bgcolor="#66CC33"]'):
            item = CnblogItem()
            for field, path in self._DETAIL_XPATHS:
                item[field] = table.xpath(path).extract_first()

            # Strip line breaks and spaces from the area cell; skip the
            # clean-up when the cell is missing so None does not raise.
            area = item["area"]
            if area is not None:
                item["area"] = (
                    area.replace("\r", "")
                    .replace("\n", "")
                    .replace(" ", "")
                    .strip()
                )

            item["page"] = response.meta["page"]
            yield item