as the title: if you want to climb the gourmet pictures on the gourmet website, take the name of the cuisine as the entrance, climb the delicacies of each cuisine and put it in a folder named after the cuisine with the name of the food as the picture name. If you crawl directly and do not make any settings in pipeline, the picture can be downloaded directly, but when you set pipeline, you cannot download the image, warning: Item contains no Images. The
code is as follows:
crawler file meishijie.py:
import scrapy
import re
from ..items import ImgItem
class MeishijieSpider(scrapy.Spider):
name = "meishijie"
allowed_domains = ["meishij.net"]
start_urls = ["https://www.meishij.net/china-food/caixi/"]
def parse(self, response):
cuisine_list=response.xpath("//dl[@class="listnav_dl_style1 w990 clearfix"]//dd/a/@href").extract()
-sharp extract the link of each cuisine
-sharpprint(len(link_list)) -sharp the amount of the cuicines
for cuisine_url in cuisine_list:
-sharpprint(cuisine_url)
yield scrapy.Request(cuisine_url,callback=self.parse_cuisine_img)
def parse_cuisine_img(self,response):
item=ImgItem()
item["image_urls"] = response.xpath("//img[@class="img"]//@src").extract()
item["image_names"] = response.xpath("//div[@class="c1"]//strong//text()").extract()
-sharpitem["vc"] = response.xpath("//div[@class="c1"]//span//text()").extract()
-sharpprint(len(item["image_urls"]))
-sharp get the url of the next page
next_link=response.xpath("//div[@class="listtyle1_page"]//a[@class="next"]//@href").extract()
split_name=re.split("/",next_link[0])
cuisine=split_name[-2] -sharp get the name of each cuisine
item["cuisine_names"]=cuisine
-sharpprint(item["cuisine_names"])
-sharpprint(item["image_names"])
-sharpprint(item["image_urls"])
-sharpprint(item["cuisine_names"])
yield item
if next_link:
next_link = next_link[0]
-sharpprint(next_link)
yield scrapy.Request(next_link,callback=self.parse_cuisine_img)
pipeline file:
import re
import os
import urllib
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
import json
import codecs
import shutil
class JsonWithEncodingPipeline(object):
def __init__(self):
self.file=codecs.open("meishijie.json","w",encoding="utf-8")
def process_item(self,item,spider):
line = json.dumps(dict(item),ensure_ascii=False) + "\n"
self.file.write(line)
return item
def spider_closed(self,spider):
self.file.close()
class ImgPipeline(ImagesPipeline):
def get_media_request(self,item,info):
for img_url in item["image_urls"]:
yield Request(img_url,meta={"item":item,"index":item["image_urls"].index(img_url)})
def item_completed(self,results,item,info):
image_paths = [x["path"] for ok,x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
-sharpitem["image_paths"] = image_paths
return item
def file_path(self, request, response=None, info=None):
item = request.meta["item"] -sharp metaitem
index = request.meta["index"] -sharp index
folder_name=item["cuisine_names"]
image_guid = item["image_names"][index]
filename = u"full/{0}/{1}".format(folder_name,image_guid)
return filename
settings:
BOT_NAME = "img"
SPIDER_MODULES = ["img.spiders"]
NEWSPIDER_MODULE = "img.spiders"
ITEM_PIPELINES = {
"img.pipelines.ImgPipeline": 1,
"img.pipelines.JsonWithEncodingPipeline":2,
-sharp"img.pipelines.ImgPipeline":1
}
-sharp
IMAGES_STORE=".../food-image/pic"
items.py
import scrapy
class ImgItem(scrapy.Item):
-sharp define the fields for your item here like:
-sharp name = scrapy.Field()
cuisine_names = scrapy.Field()
image_names = scrapy.Field()
image_urls = scrapy.Field()
-sharpvc = scrapy.Field()
-sharpimages=scrapy.Field()
Last run:
scrapy crawl meishijie
which friend can take a look at it? I would appreciate it!
complete code github link: https://github.com/QingZhang1.