Why do these url jump back to the selenium of middleware via selenium jump to the url request crawled down the page in scrapy, instead of calling back to the following def
def parse(self, response):
contents = response.xpath("//*[@id="tbodyID"]/tr")
for content in contents:
watch_url = "http://mooc1-1.chaoxing.com" + str(content.xpath("td[2]/a/@href").extract_first())
yield scrapy.Request(url=watch_url, callback=self.parse_detail, dont_filter=True)
def parse_detail(self, response):
alls = response.xpath("/html/body/div[3]/div/div[5]/table/tbody/tr")
for all in alls:
item = ChaoxinItem()
item["start_time"] = all.xpath("td[1]/text()").extract_first()
item["watch_time"] = all.xpath("td[2]/text()").extarct_first()
item["computer"] = all.xpath("td[3]/text()").extarct_first()
yield item
class JSPageMiddleware (object):
-sharpchrome
def process_request(self, request, spider):
if spider.name == "hxxy":
-sharp browser = webdriver.Chrome(executable_path="D:/Chromedriver/chromedriver.exe")
spider.browser.get(request.url)
time.sleep(10)
spider.browser.find_element_by_xpath("/html/body/div[2]/div[1]/div[1]/div[1]/div/a").click()
spider.browser.switch_to_window(spider.browser.window_handles[1])
time.sleep(1)
spider.browser.switch_to_frame("frame_content")
spider.browser.find_element_by_xpath("/html/body/div/div/div[2]/div[2]/ul/li[1]/div[1]/a").click()
spider.browser.switch_to_window(spider.browser.window_handles[2])
spider.browser.find_element_by_xpath("//*[@id="s3"]/div/ul/li[3]/a").click()
time.sleep(1)
spider.browser.find_element_by_xpath("//*[@id="left"]/div[4]/div[3]/div[1]/div[1]/div/h3/a/span/span[2]").click()
time.sleep(1)
spider.browser.switch_to_window(spider.browser.window_handles[3])
spider.browser.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/table/tbody/tr[1]/td[6]/a").click()
time.sleep(1)
for i in range(7):
button = spider.browser.find_element_by_xpath("//*[@id="moreButton"]")
button.click()
time.sleep(1)
return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)