Use scrapy to climb a website with more than 47000 pages, obviously did not finish climbing, the result ended every two or three hours, showing finish. But I didn't finish the climb.

< H1 > attach the source code of the crawler file. < / H1 >
import scrapy
from openhub.items import OpenhubItem
from lxml import etree
import json

class ProjectSpider(scrapy.Spider):
    name = "project"
    -sharp allowed_domains = []
    start_urls = ["https://www.openhub.net/p?page=1&query="]
    url = "https://www.openhub.net/p?page={}&query="
    page = 1
    def parse(self, response):
        div_list = response.xpath("//div[@class="well searchable"]")
        pid = 0
        for odiv in div_list:
            -sharpitem
            item = OpenhubItem()
            -sharp 
            post = odiv.xpath(".//div/a[@class="pull-left logo"]/img/@src").extract_first()
            pid = (pid+1)+(self.page*10)-10
            item["pid"] = pid
            if post:
                item["post"] = post
            else:
                item["post"] = ""
            licenses = odiv.xpath(".//div[@class="licenses pull-right"]").xpath("string(.)").extract()
            lic = ",".join(licenses).replace("\n","").strip("Licenses:")
            if lic:
                item["licenses"] = lic
            else:
                item["licenses"] = "No declared licenses"
            href = odiv.xpath(".//div/a[@class="pull-left logo"]/@href").extract_first()
            item["openhub_url"] = "https://www.openhub.net" + href
            time = odiv.xpath(".//div[@id="inner_content"]/div/p/i/abbr/@title").extract_first()
            if time:
                item["time"] = time
            else:
                item["time"] = ""
            detail_url = item["openhub_url"]
            lt = []
            -sharp 
            with open("time222.txt", "r", encoding="utf-8") as f:
                is_Existed = False
                need_update = False
                for lines in f.readlines():
                    line = lines.strip("\n")
                    line = json.loads(line)
                    lt.append(line)
                    -sharp if line.get("openhub_url") == item["url"]:
                    if item["openhub_url"] == line.get("openhub_url"):
                        is_Existed = True
                        if item["time"] != line.get("time"):
                            need_update = True
                if (is_Existed == False) or (need_update == True):
                    yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={"item": item})
        if self.page <= 47517:
            self.page += 1
            url = self.url.format(self.page)
            yield scrapy.Request(url=url, callback=self.parse)
    -sharp
    def parse_detail(self, response):
        -sharpresponsemetaitem
        item = response.meta["item"]
        item["title"] = response.xpath("//div[@id="project_header"]/div[1]/h1/a/text()").extract_first()
        summary = response.xpath("//section[@id="project_summary"]").xpath("string(.)").extract_first().strip(" \n").replace("\n","")
        if summary:
            item["summary"] = summary
        else:
            item["summary"] = ""
        score = response.xpath("//*[@id="i_use_this_container"]/div/a/text()").extract_first()
        if score:
            item["score"] = score
        else:
            item["score"] = ""
        tags = response.xpath("//p[@class="tags"]").xpath("string(.)").extract_first().strip("\n")
        if tags:
            item["tags"] = tags
        else:
            item["tags"] = ""
        info = response.xpath("//div[@class="well"]/ul").xpath("string(.)").extract_first().strip("\n").replace("\n","")
        if info:
            item["info"] = info
        else:
            item["info"] = ""
        organiz_text = response.xpath("//div[@class="col-xs-12"]/div[@class="col-xs-7"][1]/a/text()").extract_first()
        if organiz_text:
            item["organiz_text"] = organiz_text
        else:
            item["organiz_text"] = ""
        organiz_url = response.xpath("//div[@class="col-xs-12"]/div[@class="col-xs-7"][1]/a/@href").extract_first()
        if organiz_url:
            item["organiz_url"] = "https://www.openhub.net" + organiz_url
        else:
            item["organiz_url"] = ""
        language = response.xpath("//table[@class="table table-striped unstyled pull-left language_table"]//tr").xpath("string(.)").extract()
        if language:
            item["language"] = " ".join(language).replace("\n","")
        else:
            item["language"] = ""

        user_rate = response.xpath("//div[@id="community_rating"]/div[1]/div[1]/text()").extract_first()
        if user_rate:
            item["user_rate"] = "".join(user_rate).strip("\n").strip(" users rate this project:")
        else:
            item["user_rate"] = ""

        rating = response.xpath("//div[@id="community_rating"]/div[1]/div[2]/div/div/div/text()").extract_first()
        if rating:
            item["rating"] = "".join(rating).strip("\n")
        else:
            item["rating"] = ""

        link = response.xpath("//div[@class="col-xs-12"]/div[@class="col-xs-5 text-right text-right"]/a/@href").extract_first()
        link_url = "https://www.openhub.net" + link

        a_list = response.xpath("//div[@class="well"]/div[@class="col-xs-12"]/div[@class="col-xs-5 text-right"]/a/@href").extract_first()
        -sharp print(a_list)
        if a_list:
            code_link = "https://www.openhub.net" + a_list
        -sharp print(code_link)
            yield scrapy.Request(url=code_link,callback=self.parse_detail_two2,meta={"item": item})
        yield scrapy.Request(url=link_url,callback=self.parse_detail_two,meta={"item":item})

    def parse_detail_two2(self, response):
        item = response.meta["item"]
        print("***" * 10)
        item["a"] = 0
        tr_list = response.xpath("//table[@class="table table-striped"]/tbody/tr")
        for otr in tr_list:
            item["code_url"] = otr.xpath(".//td[@class="col-md-4"]/text()").extract_first().strip("\n")
            item["scmtype"] = otr.xpath(".//td[@class="col-md-2"]/text()").extract_first().strip("\n")
            item["update_status"] = otr.xpath(".//td[@class="col-md-3"]/text()").extract_first().strip("\n")
            yield item

    -sharp 
    def parse_detail_two(self, response):
        item = response.meta["item"]
        well_list = response.xpath("//div[@class="well"]")
        for owell in well_list:
            html = etree.HTML(owell.extract())
            if html.xpath(r"//div[@class="col-md-3 margin_bottom_25"]"):
                type = html.xpath("//h3[@class="strong pull-left margin_bottom_15 margin_top_0"]/text()")
                name = html.xpath("//div[@class="col-md-3 margin_bottom_25"]/text()")
                url = html.xpath("//div[@class="col-md-6"]/h5/a/@href")
                item["type"] = "".join(type).strip("\n")
                item["name"] = "".join(name).strip("\n")
                item["url"] = "".join(url).strip("\n")
                item["a"] = 1
                yield item
        item["a"] = 2
        yield item
May.25,2022

send out the statistical information of the last run. Is the website blocked

?
Menu