Problem description
I downloaded several Scrapy projects from GitHub and put them into my own directory to run them, but I got an error.
miscpip
Windows 7
Python3.7
scrapy 1.5.1
related codes
// Please paste the code as text below (do not replace the code with screenshots)
settings.py
# Scrapy settings for douyu project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
import sys
import os
from os.path import dirname

# Make the shared "misc" package (which lives two directories above this
# file) importable before `from misc.log import *` below.
path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
sys.path.append(path)

from misc.log import *

BOT_NAME = "douyu"

SPIDER_MODULES = ["douyu.spiders"]
NEWSPIDER_MODULE = "douyu.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "douyu (+http://www.yourdomain.com)"

DOWNLOADER_MIDDLEWARES = {
    # "misc.middleware.CustomHttpProxyMiddleware": 400,
    "misc.middleware.CustomUserAgentMiddleware": 401,
}

ITEM_PIPELINES = {
    "douyu.pipelines.JsonWithEncodingPipeline": 300,
    # "douyu.pipelines.RedisPipeline": 301,
}

LOG_LEVEL = "INFO"

DOWNLOAD_DELAY = 1
spider.py
import re
import json
# Python 3 moved urlparse into urllib.parse; the bare "urlparse" module is
# Python 2 only and raises ModuleNotFoundError on Python 3.7 — this was
# almost certainly the error the poster hit.
from urllib.parse import urlparse
import urllib
import pdb

from scrapy.selector import Selector
try:
    from scrapy.spiders import Spider
except ImportError:
    # Very old Scrapy releases only provide BaseSpider.
    from scrapy.spiders import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as sle

from douyu.items import *
from misc.log import *
from misc.spider import CommonSpider
class douyuSpider(CommonSpider):
    """Crawl the douyu.com live-room directory and yield one item per room."""

    name = "douyu"
    allowed_domains = ["douyu.com"]
    start_urls = [
        "http://www.douyu.com/directory/all"
    ]
    rules = [
        Rule(sle(allow=("http://www.douyu.com/directory/all")), callback="parse_1", follow=True),
    ]

    # CSS extraction rules: "#live-list-contentbox li" selects one <li> per
    # live room ("#" is the CSS id selector; it had been mangled to "-sharp"
    # in the pasted source, which would silently match nothing).
    list_css_rules = {
        "#live-list-contentbox li": {
            "url": "a::attr(href)",
            "room_name": "a::attr(title)",
            "tag": "span.tag.ellipsis::text",
            "people_count": ".dy-num.fr::text"
        }
    }

    # Same selectors, flagged so parse_with_rules builds item objects
    # (presumably "__use"/"__list" are CommonSpider directives — defined in
    # misc.spider, not visible here).
    list_css_rules_for_item = {
        "#live-list-contentbox li": {
            "__use": "1",
            "__list": "1",
            "url": "a::attr(href)",
            "room_name": "a::attr(title)",
            "tag": "span.tag.ellipsis::text",
            "people_count": ".dy-num.fr::text"
        }
    }

    def parse_1(self, response):
        """Extract all live rooms from a directory page as douyuItem objects."""
        info("Parse " + response.url)
        x = self.parse_with_rules(response, self.list_css_rules_for_item, douyuItem)
        print(len(x))
        return x
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import redis
from scrapy import signals
import json
import codecs
from collections import OrderedDict
class JsonWithEncodingPipeline(object):
    """Append each scraped item as one UTF-8 JSON line to data_utf8.json."""

    def __init__(self):
        # Built-in open() takes an encoding directly; codecs.open() is the
        # legacy Python 2 spelling of the same thing.
        self.file = open("data_utf8.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* (field order preserved, non-ASCII kept readable)."""
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
class RedisPipeline(object):
    """Merge items into a local Redis store, keyed by item["id"].

    If a record already exists for the id, the stored fields are merged over
    the freshly scraped ones (stored values win), matching the original
    ``dict(item.items() + ritem.items())`` Python 2 semantics.
    """

    def __init__(self):
        self.r = redis.StrictRedis(host="localhost", port=6379)

    def process_item(self, item, spider):
        if not item["id"]:
            # Python 3: print is a function, not a statement.
            print("no id item!!")
        # Fetch once and reuse; the original issued a second identical GET.
        str_recorded_item = self.r.get(item["id"])
        if str_recorded_item is None:
            final_item = item
        else:
            # SECURITY: eval() on data read back from Redis executes arbitrary
            # code if the store is ever tampered with — prefer json.loads here
            # (with json.dumps on write). Kept for compatibility with data
            # already stored in repr() form.
            ritem = eval(str_recorded_item)
            # dict views cannot be concatenated with "+" on Python 3; copy
            # then update so ritem's fields override, as before.
            final_item = dict(item)
            final_item.update(ritem)
        self.r.set(item["id"], final_item)
        # Scrapy pipelines must return the item (or raise DropItem) so that
        # later pipelines still receive it; the original returned None.
        return item

    def close_spider(self, spider):
        return
items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class douyuItem(Item):
    """One douyu live room scraped from the directory listing."""
    # Room page URL (href from the listing anchor).
    url = Field()
    # Room title (anchor's title attribute).
    room_name = Field()
    # Viewer count, raw text as displayed on the page.
    people_count = Field()
    # Category/tag text of the room.
    tag = Field()
what result do you expect? What is the error message actually seen?
I hope someone can give a specific answer explaining why this error occurs and how to fix it.