from Espider.pipelines.mongodbpipeline import mongodb_pipeline
from scrapy.exceptions import DropItem
import requests,os
import hashlib
class searchwebsitepipeline(mongodb_pipeline):
    """Pipeline that picks the largest icon image from item["iconUrl"].

    NOTE(review): the pasted traceback below this class shows the original
    failure mode: the constructor was misspelled ``__int__``, so Python never
    called it, ``self.headers`` was never set, and ``get_max_size_url`` raised
    ``AttributeError``. Fixed here, together with the download/save bugs.
    """

    def __init__(self):
        # Preserve base-class initialisation (the old __int__ typo meant the
        # inherited __init__ ran; an explicit super() call keeps that true).
        super().__init__()
        # Browser-like UA so image hosts don't reject the request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    def get_max_size_url(self, url_list):
        """Download every URL in *url_list* into ./image and return the URL
        whose downloaded file is largest (first one wins on ties).

        Returns None for an empty url_list.
        """
        # Bug fixes vs. original:
        #  - res.contet -> res.content (typo)
        #  - hashlib.sha1(...) hash object used as filename -> .hexdigest()
        #  - the file was written with the *path string* instead of the
        #    downloaded bytes, so every file had the same size and the
        #    max-size selection was meaningless
        #  - ./image was never created
        os.makedirs("./image", exist_ok=True)
        largest_url = None
        largest_size = -1
        for url in url_list:
            res = requests.get(url, headers=self.headers)
            save_path = os.path.join(
                "./image", hashlib.sha1(res.content).hexdigest())
            with open(save_path, "wb") as fh:
                fh.write(res.content)
            size = os.path.getsize(save_path)
            if size > largest_size:
                largest_size = size
                largest_url = url
        return largest_url

    def process_item(self, item, spider):
        print("mongodb ")
        # Scrapy pipelines must return the item (or raise DropItem);
        # the original implicitly returned None, breaking later pipelines.
        return item
pipeline headers
2018-11-26 03:42:36 [scrapy.core.scraper] ERROR: Error processing {"androidUpProductAbstract": "",
"androidUpProductDetailType": 2,
"androidUpProductLink": "",
"androidUpProductName": "",
"businessName": "",
"iconUrl": ["1.ico", "1.ico"],
"iosUpProductAbstract": "",
"iosUpProductDetailType": 1,
"iosUpProductLink": "",
"iosUpProductName": ""}
Traceback (most recent call last):
File "/home/shenjianlin/.local/lib/python3.4/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/shenjianlin/my_project/Espider/Espider/pipelines/searchwebsitepipeline.py", line 37, in process_item
iconUrl=self.get_max_size_url(item["iconUrl"])
File "/home/shenjianlin/my_project/Espider/Espider/pipelines/searchwebsitepipeline.py", line 17, in get_max_size_url
res = requests.get(url, headers=self.headers)
AttributeError: "searchwebsitepipeline" object has no attribute "headers"
print links