recently learn crawler, use selenium to crawl the home page picture of the website, test to get the url, of the picture, but always report the following error, can not find the reason, hope the hero to help!
the code is as follows:
item.py
import scrapy
class JiandanItem(scrapy.Item):
-sharp define the fields for your item here like:
-sharp name = scrapy.Field()
image_urls = scrapy.Field()-sharp
images = scrapy.Field()
pipeline.py
import os
import urllib
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from PIL import Image
from jiandan import settings
class JiandanPipeline(ImagesPipeline)
def get_media_requests(self, item, info):
for image_url in item["image_urls"]:
print(image_url)
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x["path"] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item
middlewares.py
import scrapy
from selenium import webdriver
from scrapy.http import HtmlResponse
import time
class PageMiddleware(object):
def process_request(self, request, spider):
if request.meta.has_key("PhantomJS"):
return
else:
driver = webdriver.PhantomJS()
driver.get(request.url)
time.sleep(1)
content = driver.page_source.encode("utf-8")
-sharp print(content)
driver.quit()
return HtmlResponse(request.url, encoding="utf-8",body=content, request=request)
jiandanSpider.py
import scrapy
from jiandan.items import JiandanItem
from scrapy.crawler import CrawlerProcess
class jiandanSpider(scrapy.Spider):
name = "jiandan"
allowed_domains = ["http://www.172mn.com/"]
start_urls = ["http://www.172mn.com/"]
def parse(self, response):
item = JiandanItem()
item["image_urls"] = response.xpath("//li//img/@src").extract()
yield item
settings.py
BOT_NAME = "jiandan"
SPIDER_MODULES = ["jiandan.spiders"]
NEWSPIDER_MODULE = "jiandan.spiders"
HTTPERROR_ALLOWED_CODES = [403]
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.10 Safari/537.36",
"Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Language" : "zh-CN,zh;q=0.9",
}
DOWNLOADER_MIDDLEWARES = {
"jiandan.middlewares.PageMiddleware": 543,
}
ITEM_PIPELINES = {
"jiandan.pipelines.JiandanPipeline": 1,
}
IMAGES_STORE = "/home/python/Desktop/"
DOWNLOAD_DELAY = 3
IMAGES_THUMBS = {
"small": (50, 50),
"big": (200, 200),
}
error message:
https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session {"desiredCapabilities": {"platform": "ANY", "browserName": "phantomjs", "version": "", "javascriptEnabled": true}}
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:33:58 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/url {"url": "https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg", "sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:00 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da/source {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:34265/wd/hub/session/ff87b910-5d03-11e8-8234-c5368f3096da {"sessionId": "ff87b910-5d03-11e8-8234-c5368f3096da"}
2018-05-21 22:34:01 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-21 22:34:01 [scrapy] DEBUG: Crawled (200) <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> (referer: None)
2018-05-21 22:34:01 [scrapy] DEBUG: File (downloaded): Downloaded file from <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> referred in <None>
2018-05-21 22:34:01 [scrapy] ERROR: File (unknown-error): Error processing file from <GET https://img.codeshelper.com/upload/img/2021/03/13/e1b20hwhbgg7616.jpg> referred in <None>
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/files.py", line 355, in media_downloaded
checksum = self.file_downloaded(response, request, info)
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 95, in file_downloaded
return self.image_downloaded(response, request, info)
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 99, in image_downloaded
for path, image, buf in self.get_images(response, request, info):
File "/usr/local/lib/python2.7/dist-packages/scrapy/pipelines/images.py", line 112, in get_images
orig_image = Image.open(BytesIO(response.body))
File "/usr/local/lib/python2.7/dist-packages/PIL/Image.py", line 2590, in open
% (filename if filename else fp))
IOError: cannot identify image file <cStringIO.StringI object at 0x7f7c38d2fcf0>