I"m going to crawl the paper data on Web of science based on the author"s name, and submit the author"s name and sid, result using the post method. r.text returns correct, and can also match the required information . But! I"m going to get the url and the number of pages, and then I can crawl more pages further, but the output r.url doesn"t return the correct URL ! But obviously r.text returned to the right ah! This is so weird!
has been plagued by this problem for several days! Turn to all the gods for rescue!
import re
-sharp from threading import Thread
from multiprocessing import Process
from multiprocessing import Manager
import requests
import time
import xlrd
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd
from pandas import DataFrame
authors = ["HUANG J X"]
root = "http://apps.webofknowledge.com"
s = requests.get(root)
sid = re.findall(r"SID=\w+&", s.url)[0].replace("SID=", "").replace("&", "")
root_url = "http://apps.webofknowledge.com/UA_GeneralSearch.do"
count = 0
basic_urls = []
pages = []
for author in authors:
print author
time.sleep(1)
count += 1
if count % 100 ==0:
s = requests.get(root)
sid = re.findall(r"SID=\w+&", s.url)[0].replace("SID=", "").replace("&", "")
"""headers = {
"Origin": "http://apps.webofknowledge.com",
"Referer": "http://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=5B3frlTpw4JHeQwYMgR&preferencesSaved=",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Content-Type": "application/x-www-form-urlencoded"
}"""
headers = {
"Origin": "http://apps.webofknowledge.com",
"Referer": "http://apps.webofknowledge.com/Search.do?product=UA&SID=6E6M5icjM8KCcO9BZx4&search_mode=GeneralSearch&preferencesSaved=",
"Content-Type": "application/x-www-form-urlencoded",
"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
form_data = {
"fieldCount": 1,
"action": "search",
"product": "UA",
"search_mode": "GeneralSearch",
"SID": sid,
"max_field_count": 25,
"formUpdated": "true",
"value(input1)": author,
"value(select1)": "AU",
"value(hidInput1)": "",
"limitStatus": "collapsed",
"ss_lemmatization": "On",
"ss_spellchecking": "Suggest",
"SinceLastVisit_UTC": "",
"SinceLastVisit_DATE": "",
"period": "Range Selection",
"range": "ALL",
"startYear": "1900",
"endYear": "2018",
"update_back2search_link_param": "yes",
"ssStatus": "display:none",
"ss_showsuggestions": "ON",
"ss_query_language": "auto",
"ss_numDefaultGeneralSearchFields": 1,
"rs_sort_by": "PY.D;LD.D;SO.A;VL.D;PG.A;AU.A"
}
s = requests.Session()
r = s.post(root_url, data=form_data, headers=headers,timeout=30)
-sharp
print r.text
r.encoding = r.apparent_encoding
tree = etree.HTML(r.text)
basic_url = r.url
if basic_url.strip()=="":
basic_url = u""
else:
basic_url = r.url
page = tree.xpath("//span[@id="pageCount.top"]/text()")
if len(page)==0:
page = u""
else:
page = tree.xpath("//span[@id="pageCount.top"]/text()")[0]
-sharppager.url
print page,basic_url
basic_urls.append(basic_url)
pages.append(page)