The source code of the crawler file:
import json

import scrapy
from lxml import etree

from openhub.items import OpenhubItem


class ProjectSpider(scrapy.Spider):
    name = "project"
    # allowed_domains = []
    start_urls = ["https://www.openhub.net/p?page=1&query="]
    # Listing-page URL template and the page counter used for pagination.
    url = "https://www.openhub.net/p?page={}&query="
    page = 1
    def parse(self, response):
        # One "well searchable" block per project on the listing page.
        div_list = response.xpath('//div[@class="well searchable"]')
        pid = 0
        for odiv in div_list:
            # Create an item for this project entry.
            item = OpenhubItem()
            post = odiv.xpath('.//div/a[@class="pull-left logo"]/img/@src').extract_first()
            # Sequential project id: 10 results per page, offset by the page number.
            pid += 1
            item["pid"] = pid + (self.page - 1) * 10
            item["post"] = post or ""
            licenses = odiv.xpath('.//div[@class="licenses pull-right"]').xpath("string(.)").extract()
            # Join the license strings and drop the "Licenses:" label.
            lic = ",".join(licenses).replace("\n", "").replace("Licenses:", "").strip()
            item["licenses"] = lic or "No declared licenses"
            href = odiv.xpath('.//div/a[@class="pull-left logo"]/@href').extract_first()
            item["openhub_url"] = "https://www.openhub.net" + href
            time = odiv.xpath('.//div[@id="inner_content"]/div/p/i/abbr/@title').extract_first()
            item["time"] = time or ""
            detail_url = item["openhub_url"]
            lt = []
            # time222.txt holds the previously crawled records, one JSON object
            # per line with at least "openhub_url" and "time" keys; note that
            # it is re-read for every project on the page.
            with open("time222.txt", "r", encoding="utf-8") as f:
                is_existed = False
                need_update = False
                for lines in f.readlines():
                    line = json.loads(lines.strip("\n"))
                    lt.append(line)
                    if item["openhub_url"] == line.get("openhub_url"):
                        is_existed = True
                        if item["time"] != line.get("time"):
                            need_update = True
            # Only fetch the detail page for projects that are new or whose
            # timestamp changed since the last crawl.
            if not is_existed or need_update:
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})
        # Queue the next listing page until the last one is reached.
        if self.page <= 47517:
            self.page += 1
            yield scrapy.Request(url=self.url.format(self.page), callback=self.parse)

    def parse_detail(self, response):
        # Retrieve the item passed along through the request meta.
        item = response.meta["item"]
        item["title"] = response.xpath('//div[@id="project_header"]/div[1]/h1/a/text()').extract_first()
        summary = response.xpath('//section[@id="project_summary"]').xpath("string(.)").extract_first().strip(" \n").replace("\n", "")
        item["summary"] = summary or ""
        score = response.xpath('//*[@id="i_use_this_container"]/div/a/text()').extract_first()
        item["score"] = score or ""
        tags = response.xpath('//p[@class="tags"]').xpath("string(.)").extract_first().strip("\n")
        item["tags"] = tags or ""
        info = response.xpath('//div[@class="well"]/ul').xpath("string(.)").extract_first().strip("\n").replace("\n", "")
        item["info"] = info or ""
        organiz_text = response.xpath('//div[@class="col-xs-12"]/div[@class="col-xs-7"][1]/a/text()').extract_first()
        item["organiz_text"] = organiz_text or ""
        organiz_url = response.xpath('//div[@class="col-xs-12"]/div[@class="col-xs-7"][1]/a/@href').extract_first()
        item["organiz_url"] = "https://www.openhub.net" + organiz_url if organiz_url else ""
        language = response.xpath('//table[@class="table table-striped unstyled pull-left language_table"]//tr').xpath("string(.)").extract()
        item["language"] = " ".join(language).replace("\n", "") if language else ""
        user_rate = response.xpath('//div[@id="community_rating"]/div[1]/div[1]/text()').extract_first()
        # Keep only the count in front of " users rate this project:".
        item["user_rate"] = user_rate.replace(" users rate this project:", "").strip("\n ") if user_rate else ""
        rating = response.xpath('//div[@id="community_rating"]/div[1]/div[2]/div/div/div/text()').extract_first()
        item["rating"] = rating.strip("\n") if rating else ""
        # Follow the project's links page and, when present, its code-locations page.
        link = response.xpath('//div[@class="col-xs-12"]/div[@class="col-xs-5 text-right text-right"]/a/@href').extract_first()
        link_url = "https://www.openhub.net" + link
        a_list = response.xpath('//div[@class="well"]/div[@class="col-xs-12"]/div[@class="col-xs-5 text-right"]/a/@href').extract_first()
        if a_list:
            code_link = "https://www.openhub.net" + a_list
            yield scrapy.Request(url=code_link, callback=self.parse_detail_two2, meta={"item": item})
        yield scrapy.Request(url=link_url, callback=self.parse_detail_two, meta={"item": item})
    def parse_detail_two2(self, response):
        item = response.meta["item"]
        # "a" marks which callback produced the item: 0 = code-locations page.
        item["a"] = 0
        tr_list = response.xpath('//table[@class="table table-striped"]/tbody/tr')
        for otr in tr_list:
            # One row per enlisted code location.
            item["code_url"] = otr.xpath('.//td[@class="col-md-4"]/text()').extract_first().strip("\n")
            item["scmtype"] = otr.xpath('.//td[@class="col-md-2"]/text()').extract_first().strip("\n")
            item["update_status"] = otr.xpath('.//td[@class="col-md-3"]/text()').extract_first().strip("\n")
            yield item

    def parse_detail_two(self, response):
        item = response.meta["item"]
        well_list = response.xpath('//div[@class="well"]')
        for owell in well_list:
            # Re-parse each "well" block with lxml so it can be queried directly.
            html = etree.HTML(owell.extract())
            if html.xpath('//div[@class="col-md-3 margin_bottom_25"]'):
                link_type = html.xpath('//h3[@class="strong pull-left margin_bottom_15 margin_top_0"]/text()')
                name = html.xpath('//div[@class="col-md-3 margin_bottom_25"]/text()')
                url = html.xpath('//div[@class="col-md-6"]/h5/a/@href')
                item["type"] = "".join(link_type).strip("\n")
                item["name"] = "".join(name).strip("\n")
                item["url"] = "".join(url).strip("\n")
                # 1 = an entry from the project's links page.
                item["a"] = 1
                yield item
        # 2 = the base detail item, yielded once after the links.
        item["a"] = 2
        yield item
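
The spider assumes an OpenhubItem with one Field per key it sets. The original items.py is not shown above, so the following is only a minimal sketch reconstructed from the field names used in the spider:

import scrapy


class OpenhubItem(scrapy.Item):
    # Listing-page fields
    pid = scrapy.Field()
    post = scrapy.Field()
    licenses = scrapy.Field()
    openhub_url = scrapy.Field()
    time = scrapy.Field()
    # Detail-page fields
    title = scrapy.Field()
    summary = scrapy.Field()
    score = scrapy.Field()
    tags = scrapy.Field()
    info = scrapy.Field()
    organiz_text = scrapy.Field()
    organiz_url = scrapy.Field()
    language = scrapy.Field()
    user_rate = scrapy.Field()
    rating = scrapy.Field()
    # Code-locations page fields
    code_url = scrapy.Field()
    scmtype = scrapy.Field()
    update_status = scrapy.Field()
    # Links-page fields, plus the "a" flag (0/1/2) marking the source callback
    type = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()
    a = scrapy.Field()

With the item defined, the crawl starts with the standard Scrapy command line, e.g. scrapy crawl project -o projects.json. The time222.txt file read in parse is presumably maintained by a pipeline (not shown here) that appends one JSON line per scraped item.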