import json
import random
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

import yt_common.factory
from config.ip_pool import get_ip
from middlewares import *  # project-local middlewares, kept as in the original


class lagou():

    def __init__(self):
        self.ua = UserAgent()
        self.http = yt_common.factory.Factory.get_instance("project")
    def get_content(self):
        # Session cookies copied from a logged-in browser; the embedded quotes in
        # fromsite="" must be escaped, otherwise the string literal ends early
        cookies_str = "user_trace_token=20180909010719-4eb82332-59f2-4979-b7ba-4a96de35eb40; _ga=GA1.2.1153938840.1536426437; LGUID=20180909010720-a5755fe0-b389-11e8-8ccd-525400f775ce; _qddaz=QD.wx1cg9.ftx1wj.jnl51m1t; JSESSIONID=ABAAABAAADEAAFIE6475DE07CCCE2D0833999916DC6AED6; utm_source=m_cf_seo_ald_wap; fromsite=\"\"; TG-TRACK-CODE=jobs_similar; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22%24device_id%22%3A%2216628753458944-0ec2370b38b4a8-163b6953-1296000-16628753459972%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22m_cf_cpt_baidu_pc%22%2C%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; index_location_city=%E5%8C%97%E4%BA%AC; WEBTJ-ID=12252018%2C161612-167e46f847741d-04ce0d97f54b0f-163b6953-1296000-167e46f8478f5f; _gid=GA1.2.1277196703.1545725773; X_HTTP_TOKEN=3dec5bde9264a1350e562709684512ea; LG_LOGIN_USER_ID=aa0676d165159370bc5d629d9b5a41215c2b10b329a917bb; _putrc=73B45C3A2AAE9C2E; login=true; unick=%E6%8B%89%E5%8B%BE%E7%94%A8%E6%88%B76572; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545816032,1545873968,1545878834,1545904413; LGSID=20181227175333-45c3f23b-09bd-11e9-b129-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xfb7d4ab90001af54%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D1%26oq%3Dnohup%252520%2525E5%2525A4%252584%2525E7%252590%252586%26rsv_t%3D2a73cVwL843%252Ba5Ai2lBIHgKBBA9Hf58WCmSNIrhGhaXjOjWtQO46%252Fa1hW5BKfpVlE%252BnB%26inputT%3D4637%26rsv_pq%3Dbb8ccaa20001742c%26rsv_sug3%3D70%26rsv_sug1%3D54%26rsv_sug7%3D100%26bs%3Dnohup%2520%25E5%25A4%2584%25E7%2590%2586; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; gate_login_token=fa8999aa6d617649ff083782230eac8ba8c9cc1520ae502f; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1545904416; LGRID=20181227175336-475b23dd-09bd-11e9-ad84-5254005c3644"
        # Headers for the JSON listing endpoint
        headers = {
            "User-Agent": self.ua.random,
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Cookie": cookies_str,
            "X-Anit-Forge-Code": "0",
            "Connection": "keep-alive",
            "X-Anit-Forge-Token": "None",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Host": "www.lagou.com",
        }
header = {"Upgrade-Insecure-Requests": "1", "Host": "www.lagou.com", "User-Agent": self.ua.random,
"Cookie": cookies_str, "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive",
"Cache-Control": "max-age=0", "Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8","Accept-Language": "en-US,en;q=0.9","Cache-Control": "no-cache","Pragma": "no-cache"}
url_list = ["https://www.lagou.com/gongsi/0-2-0-0.json", "https://www.lagou.com/gongsi/0-1-0-0.json",
"https://www.lagou.com/gongsi/0-3-0-0.json"]
        for url in url_list:
            # The referer is the listing URL without its .json suffix
            referer = re.findall(r"(.*?)\.json", url)[0]
            headers["Referer"] = referer
            header["Referer"] = referer
            # Page through the listing from page 20 down to 1
            for i in range(20, 0, -1):
                print("%d" % i)
                print("%s" % url)
                # "pn" is the page number; the original put the .format(i)
                # placeholder on "havemark" instead of "pn"
                form_data = {"first": "false", "pn": "{}".format(i), "sortField": "0", "havemark": "0"}
                try:
                    response = requests.post(url=url, headers=headers, data=form_data,
                                             proxies=get_ip())
                    time.sleep(random.randint(10, 20))
                except Exception as e:
                    print(e)
                    # Single retry with a fresh proxy from the pool
                    response = requests.post(url=url, headers=headers, data=form_data,
                                             proxies=get_ip())
                print(response.text)
                json_data = json.loads(response.text)
for j in range(0, len(json_data["result"])):
companyId = json_data["result"][j]["companyId"]
companyFullName = json_data["result"][j]["companyFullName"]
companyShortName = json_data["result"][j]["companyShortName"]
companyLogoLink = "https://www.lgstatic.com/thumbnail_300x300/" + str(
json_data["result"][j]["companyLogo"])
companyFeatures = json_data["result"][j]["companyFeatures"]
companyLink = "https://www.lagou.com/gongsi/" + str(json_data["result"][j]["companyId"]) + ".html"
companyCity = json_data["result"][j]["city"]
companySize = json_data["result"][j]["companySize"]
financeStage = json_data["result"][j]["financeStage"]
                    # The original replace("", ",") inserts a comma between every
                    # character; replacing the space separator is the likely intent
                    industryField = item["industryField"].replace(" ", ",")
                    try:
                        res = requests.get(companyLink, headers=header,
                                           proxies=get_ip())
                        time.sleep(random.randint(10, 30))
                    except Exception as e:
                        print(e)
                        res = requests.get(companyLink, headers=header,
                                           proxies=get_ip())
                    print(companyLink)
                    print(res.url)
                    teamInfo = {}
                    # Parse the detail page once instead of re-parsing for every
                    # XPath; the original also nested unescaped double quotes inside
                    # double-quoted strings, which is a syntax error
                    tree = etree.HTML(res.text)
                    companyLink = tree.xpath('//div[@class="company_main"]/h1/a/@href')[0]
                    print("%s" % companyLink)
                    companyAddress = tree.xpath('//p[@class="mlist_li_desc"]/text()')
                    companyAddress = [ad.strip() for ad in companyAddress]
                    name = tree.xpath('//p[@class="item_manager_name"]/span/text()')
                    instro = tree.xpath('//div[@class="item_manager_content"]/p/text()'
                                        '|//div[@class="item_manager_content"]/text()')
                    title = tree.xpath('//p[@class="item_manager_title"]/text()')
                    print(name)
                    print(instro)
                    print(title)
                    # Loop variable renamed from i so it does not shadow the outer
                    # page counter
                    for k in range(len(name)):
                        member = teamInfo.setdefault(str(k), {})
                        member["name"] = name[k]
                        member["title"] = title[k] if k < len(title) else ""
                        # The original indexed instro[i] whenever the list was
                        # non-empty, which raises IndexError when instro is shorter
                        # than name
                        member["instro"] = "".join(instro[k].split()) if k < len(instro) else ""
data = json.dumps({"companyId": companyId, "companyFullName": companyFullName,
"companyShortName": companyShortName, "companyLogoLink": companyLogoLink,
"companyFeature": companyFeatures,
"company_link": companyLink,
"companyCity": companyCity,
"companySize": companySize,
"financeStage": financeStage,
"industryField": industryField,
"companyAddress": companyAddress,
"companyTeam": teamInfo}).encode("utf-8").decode(
"unicode_escape")
lagou_response = self.http.set_post().http_send("/spider/source/save-lagou", {"data": data})
print(lagou_response)
time.sleep(random.randint(3, 5))
print("\n")
if name ="_ _ main__":
pass
This is my scraper for Lagou.com (拉勾网, literally "pull hook net"). Whenever I crawl its pages I run into anti-scraping measures, most often a 502 gateway error.
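
A 502 usually means the gateway (or the proxy in front of it) dropped the request, so the single bare retry in the except blocks above is fragile. Below is a minimal sketch of a retry helper with exponential backoff and proxy rotation that both requests calls could go through. get_ip() is the pool helper the script already imports; the fetch_with_retry name, attempt count, timeout, and delays are illustrative assumptions, not values Lagou requires.

import random
import time

import requests

from config.ip_pool import get_ip


def fetch_with_retry(method, url, max_attempts=5, **kwargs):
    """Retry a request through a fresh proxy with exponential backoff.

    5xx responses (e.g. Lagou's 502 gateway errors) are treated the same
    as connection failures. Attempt count and delays are illustrative.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.request(method, url, proxies=get_ip(),
                                        timeout=15, **kwargs)
            if response.status_code < 500:
                return response
            print("attempt %d: HTTP %d from %s" % (attempt, response.status_code, url))
        except requests.RequestException as e:
            print("attempt %d: %s" % (attempt, e))
        # Exponential backoff with jitter so the retries do not look mechanical
        time.sleep(2 ** attempt + random.uniform(0, 3))
    raise RuntimeError("giving up on %s after %d attempts" % (url, max_attempts))

With such a helper, the two try/except blocks in get_content reduce to response = fetch_with_retry("post", url, headers=headers, data=form_data) and res = fetch_with_retry("get", companyLink, headers=header).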