#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-05-22 15:22:51
# Project: demo
from pyspider.libs.base_handler import *
import pymongo
class Handler(BaseHandler):
    """pyspider handler that scrapes TripAdvisor London attractions into MongoDB.

    Crawl flow: ``on_start`` -> ``index_page`` -> (``group_detail`` ->)
    ``detail_page`` -> ``on_result`` -> ``save_to_mongo``.
    """

    crawl_config = {
    }

    # Shared connection to the MongoDB server on "localhost"; results are
    # stored in the "trip" database.
    client = pymongo.MongoClient("localhost")
    db = client["trip"]

    def _crawl_links(self, anchors, callback):
        """Schedule a crawl with *callback* for every anchor in *anchors*.

        Anchors without an ``href`` attribute are skipped so that a stray
        ``<a>`` element cannot hand ``None`` to ``self.crawl``.
        """
        for anchor in anchors.items():
            href = anchor.attr.href
            if href:
                self.crawl(href, callback=callback)

    @every(minutes=30)
    def on_start(self):
        """Seed the crawl with the London attractions listing page."""
        self.crawl("https://cn.tripadvisor.com/Attractions-g186338-Activities-London_England.html", callback=self.index_page)

    @config(age=60*20)
    def index_page(self, response):
        """Follow the links found on the attractions listing page.

        Links inside attraction-type groups go to ``group_detail``; links of
        individual attraction entries go straight to ``detail_page``.
        """
        self._crawl_links(
            response.doc(".attraction_type_group .listing_info div.listing_title a"),
            self.group_detail,
        )
        self._crawl_links(
            response.doc("#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a"),
            self.detail_page,
        )

    def group_detail(self, response):
        """Follow every attraction on a group page and, from page 1, its pagination."""
        self._crawl_links(response.doc(".listing_title a"), self.detail_page)

        current_page = response.doc(".current").text()
        print(current_page)
        # Pagination links are only expanded from the first page so that the
        # remaining pages do not re-schedule each other.
        if current_page == "1":
            self._crawl_links(response.doc(".pageNumbers a"), self.group_detail)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single attraction page.

        Returns:
            dict with the keys ``url``, ``name``, ``rating``, ``address`` and
            ``phone``; every value except ``url`` is the text of the matching
            CSS selection.
        """
        doc = response.doc
        return {
            "url": response.url,
            "name": doc(".heading_title").text(),
            "rating": doc(".autoResize").text(),
            "address": doc(".address :nth-child(n+2)").text(),
            "phone": doc(".blEntry.phone > span").text(),
        }

    def on_result(self, result):
        """Persist every non-empty callback result to MongoDB."""
        # NOTE(review): this override does not call BaseHandler.on_result;
        # confirm that skipping the default result handling is intended.
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert *result* into the ``shdu`` collection and log on success."""
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        outcome = self.db["shdu"].insert_one(result)
        if outcome.inserted_id:
            print("save to mongo", result)