I need to crawl the website https://isisn.nsfc.gov.cn/egr for searching purposes, but the page is loaded dynamically via AJAX, and both the cookies and the POST body must carry a CAPTCHA code that is refreshed every second. How can such a page be crawled?
provide an example of query here:
Project code: F030203
Funding category: General Program (surface projects)
Approval year: 2017
POST target page
POST request
Source code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/4/15 18:12
#
# Crawl the NSFC project-search endpoint (https://isisn.nsfc.gov.cn).
# The search form is submitted via AJAX and the POST body embeds a CAPTCHA
# ("checkcode") that expires quickly. Workflow: solve the CAPTCHA once in a
# browser, save the request as a cURL dump in curl.txt, then replay it here
# with the approval year substituted for each year of interest.
import requests, json, zlib, gzip, re

# curl.txt holds a "copy as cURL" dump of a browser request captured right
# after solving the CAPTCHA; cookies and the form body are extracted from it.
with open("curl.txt") as f:
    para = f.read()

# One session so connection/cookie state persists across the yearly requests.
s = requests.session()
url = "https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list?flag=grid&checkcode="
headers = {
    "Origin": "https://isisn.nsfc.gov.cn",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept": "application/xml, text/xml, */*; q=0.01",
    "Referer": "https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list",
    "X-Requested-With": "XMLHttpRequest",
    "Connection": "keep-alive",
}

# Cookie values copied from the captured browser session. sessionidindex is
# refreshed from curl.txt below because it changes with every capture.
cookies = {
    "sessionidindex": "Nhd1hT2D2bLsDX0fbYPH6gGbpNvFGhG177Dr3BksGFj1MB11czXc!-877234612!180665615",
    "test": "69345741",
    "isisn": "98184645",
    "org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE": "zh_CN",
    "JSESSIONID": "Zd1uNLn4tg6QFEWhXZ6Hc8e0ldqtAwWS0NN5mmerlfSyLVoYJe5T!1578882446",
}
cookies["sessionidindex"] = re.findall("sessionidindex=(.*?);", para)[0]

# The raw, already URL-encoded form body (searchString, nd timestamp,
# checkcode, subject code, year, ...) is lifted verbatim from the
# --data "..." part of the cURL dump.
data = re.findall("--data \"(.*?)\"", para)[0]
print(cookies["sessionidindex"])
print(data)

# Turn the captured approval year (2005) into a str.format placeholder so
# the same body can be replayed for a range of years.
data = re.sub("year%253A2005", "year%253A{}", data)
for year in range(2005, 2017):  # NOTE: range() stops before 2017 itself
    # BUG FIX: the original called requests.post() without the extracted
    # cookies, so the CAPTCHA-bound session cookies were never sent and the
    # session object went unused. Post through the session with the cookies.
    r = s.post(url, data=data.format(year), headers=headers, cookies=cookies)
    print(r.text)