problem description
cannot put an atlas in the same directory while downloading http://www.umei.cc/p/gaoqing/..
the environmental background of the problems and what methods you have tried
tried a lot of methods on the Internet, but could not solve
related codes
/ / Please paste the code text below (do not replace the code with pictures)
-sharpcoding:utf-8
import random
import re
import urllib2
from urllib import urlopen
import requests
import logging
import time
from bs4 import BeautifulSoup,Comment
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from z2.items import Z2Item
from scrapy.http import Request
logging.basicConfig(
level=logging.INFO,
format=
"%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",
datefmt="%a, %d %b %Y %H:%M:%S",
filename="cataline.log",
filemode="w")
class Spider(CrawlSpider):
name = "z2"
img_urls = []
allowed_domains = ["www.umei.cc"]
start_urls = ["http://www.umei.cc/p/gaoqing/rihan/"]
-sharp rules = (
-sharp Rule(LinkExtractor(allow=("http://www.umei.cc/p/gaoqing/rihan/\d{1,6}.htm",), deny=("http://www.umei.cc/p/gaoqing/rihan/\d{1,6}_\d{1,6}.htm")),
-sharp callback="parse_z2_info", follow=True),
-sharp )
def start_requests(self):
yield Request(url="http://www.umei.cc/p/gaoqing/rihan/",
callback=self.parse_z2_key)
def parse_z2_key(self, response):
soup = BeautifulSoup(response.body, "lxml")
content = soup.find("div", attrs={"class": "TypeList"})
-sharp logging.debug(content)
for link in content.findAll("a", attrs={"href": re.compile( r"(.*)(/rihan/)(\d{1,6})(.htm)"), "class": "TypeBigPics"}):
logging.debug(link["href"])
yield Request(url=link["href"],
callback=self.parse_z2_info)
break
def parse_z2_info(self, response):
soup = BeautifulSoup(response.body, "lxml")
item = Z2Item()
-sharp html
for element in soup(text=lambda text: isinstance(text, Comment)):
element.extract()
-sharp script
[s.extract() for s in soup("script")]
-sharp b
[s.extract() for s in soup("b")]
ArticleDesc = soup.find("p", attrs={"class": "ArticleDesc"})
logging.debug(ArticleDesc.get_text())
Pages = soup.find("div", attrs={"class": "NewPages"}).find("li")
pageCounts = filter(str.isdigit, Pages.get_text().encode("gbk"))
-sharp
-sharp logging.debug(re.findall(r"\d+\.?\d*", Pages.get_text())[0])
-sharp
-sharp logging.debug(Pages.get_text()[1:-3])
-sharp
logging.debug(filter(str.isdigit, Pages.get_text().encode("gbk")))
-sharp img = soup.find("div", attrs={"class": "ImageBody"}).find("img")
-sharp url = img.attrs["src"]
-sharp self.img_urls.append(url)
-sharp logging.debug(self.img_urls)
item["name"] = re.match(".*/(\d+)", response.url).group(1)
logging.debug(item["name"])
-sharp image_urls = []
-sharp item["image_urls"] = image_urls
sourceUrl = response.url[0:-4]
-sharp logging.debug(sourceUrl)
for i in xrange(1, int(pageCounts) + 1):
nextUrl = sourceUrl + "_" + str(i) + ".htm"
-sharp logging.debug(nextUrl)
yield Request(url=nextUrl,callback=self.parse_z2_single_img)
item["image_urls"] = self.img_urls
yield item
def parse_z2_single_img(self, response):
soup = BeautifulSoup(response.body, "lxml")
img = soup.find("div", attrs={"class": "ImageBody"}).find("img")
url = img.attrs["src"]
self.img_urls.append(url)