the environmental background of the problems and what methods you have tried
bs4
Spyder (python3.6)
related codes
< H1 >--coding: utf-8--< / H1 > "
Created on Wed Aug 1 03:07:33 2018
@ author: stephen zheng
"
import requests
from bs4 import BeautifulSoup
import xlwt
import json
import time
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
def get_region_url (html):
-sharp url
soup=BeautifulSoup(html,"lxml")
url_list=[i["href"] for i in soup.find("div",id="region-nav").find_all("a")]
return url_list
def get_content (url,headers = None,proxy=None):
html = requests.get(url,headers=headers).content
return html
def get_url (html):
soup = BeautifulSoup(html,"lxml")
shop_list = soup.find_all("div",class_="tit")
return [i.find("a")["href"] for i in shop_list]
def get_details_content (html):
soup = BeautifulSoup(html,"lxml")
-sharp
price = soup.find("span",id="avgPriceTitle").text
-sharp
evaluation = soup.find("span",id="comment_score").find_all("span",class_="item")
-sharp
the_star = soup.find("div",class_="brief-info").find("span")["title"]
-sharp
title = soup.find("div",class_="shop-name").find("span").text
-sharp
comments = soup.find("span",id="sub-title").text
-sharp
address = soup.find("span",itemprop="street-address").text
print (u":"+title)
for ev in evaluation:
print (ev.text)
print (price)
print (u":"+comments)
print (u":"+address.strip())
print (u":"+the_star)
print ("=======================")
return (title,evaluation[0].text,evaluation[1].text,evaluation[2].text,price,comments,address,the_star)
if name ="_ _ main__":
items = []
start_url = "https://www.dianping.com/search/keyword/1/0_%E7%AF%AE%E7%90%83%E9%A6%86"
base_url = "http://www.dianping.com"
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Cookie":"_hc.v=2f4046e6-6012-4664-6e8b-cdd151ed44e7.1494257443; PHOENIX_ID=0a017918-15c393c3773-116bcd2f;__utma=1.2147215388.1495608855.1495608855.1495622249.2; __utmc=1; __utmz=1.1495608855.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_ViewType=10; JSESSIONID=9F1079723C06E82D7555D3373D5DD9B7; aburl=1; cy=2; cye=beijing; __mta=209559469.1495614452018.1495625059663.1495625292763.21"
}
start_html = get_content(start_url)
region_url_list = get_content(start_html)
region_url_list - [base_url+url for url in region_url_list]
for url in region_url_list:
for i in range(1,10):
url_list_page = get_url(get_content(url+"p"+str(i)))
url_list = [base_url+url for url in url_list_page]
for url in url_list:
print (url)
details_html = get_content(url,headers=headers)
item = get_details_content(details_html)
items.append(item)
newTable="DZDP.xls"
wb = xlwt.Workbook(encoding="UTF-8")
ws = wb.add_sheet("test1")
headData = ["","","","","","","",""]
for colnum in range (0pj8):
ws.write(0,colnum,headData(colnum),xlwt.easyxf("font:bold on"))
index = 1
lens = len(items)
for j in range(0,lens):
for i in range(0,8):
print(items[j][i])
ws.write(index,i,items[j][i])
index+=1
wb.save(newTable)-sharp-sharp-sharp
what result do you expect? What is the error message actually seen?
runfile
Traceback (most recent call last):
File "< ipython-input-11-ac3a27dc0ab0 >", line 1, in < module >
runfile("C:/Users/stephen/Desktop/untitled0.py")
File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, "exec"), namespace)
File "C:/Users/stephen/Desktop/untitled0.py", line 64, in < module >
region_url_list = get_content(start_html)
File "C:/Users/stephen/Desktop/untitled0.py", line 23, in get_content
html = requests.get(url,headers=headers).content
File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 72, in get
return request("get", url, params=params, **kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 612, in send
adapter = self.get_adapter(url=request.url)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 703, in get_adapter
raise InvalidSchema("No connection adapters were found for "%s"" % url)
InvalidSchema: No connection adapters were found for"< html lang= "en" >
< head >
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title> , , ,,-</title>
<!---->
<link rel="icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!---->
<link rel="shortcut icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!--1. csscss -->
<link rel="stylesheet" type="text/css" href="//www.dpfile.com/app/pc-common/index.min.css">
<!--css-->
<link rel="stylesheet" href="//www.dpfile.com/app/dpindex-new-static/static/new_404_pc.min.css" type="text/css"/> <!--2. js -->
<!--3. -->
<script type="text/javascript">
window._DP_HeaderData = {
"cityId": "1", // id , : "1"
"cityCName": "", // , , :
"cityEnName": "", // :shanghai
"pageType": "index", // "index" || "search", :"index"
"userId": "", // id "":"",
"userName":"", // "":"",
"dpId": "", // id, :""
"uesrLng": "0.0", // :"0.0";
"userLan": "0.0", // :"0.0";
"clientIp": "" //ip "127.0.0.1"
}
</script>
<!--4. js,:web4.0-->
<script type="text/javascript">
(function (win, doc, ns) { var cacheFunName = "_MeiTuanALogObject"; win[cacheFunName] = ns; if (!win[ns]) { var _LX = function () { _LX.q.push(arguments); return _LX; }; _LX.q = _LX.q || []; _LX.l = +new Date(); win[ns] = _LX; } })(window, document, "LXAnalytics");
</script>
<script src="http://analytics.meituan.net/analytics.js" type="text/javascript" charset="utf-8" async defer></script>
< / head >
< div class= "header-container" >
<div id="top-nav" class="top-nav">
</div>
<div id="logo-input" class="logo-input">
<div class="logo-input-container clearfix">
<a title="" href="/" class="logo"></a>
<div class="search-box">
<div class="search-bar ">
<span class="search-container clearfix">
<i class="i-search"></i>
</span>
<p class="hot-search J-hot-search">
</div>
</div>
<div class="qrcode-container">
<i class="close"></i>
<div class="qrcode">
<p class="qrcode-text">
<div class="qrcode-img"></div>
</div>
</div>
</div>
</div>
<div class="cate-container">
<div class="nav-header">
<div class="navbar">
<a href="//www.dianping.com"><span class="cate-item all-cate"></span></a>
<a target="_blank" class="cate-item other-cate" href="http://t.dianping.com/shanghai"></a>
<a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/event/shanghai"></a>
<i class = "hot-icon"></i>
<a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/shanghai/group?utm_source=dp_pc_other"></a>
</div>
</div>
<div class="gradient"></div>
</div>
< / div >
< div class= "not-found" >
<div class="not-found-content">
<div class="img-not-found"></div>
<div class="not-found-right">
<p class="not-found-words">......
<p class="not-found-words1">:
<p class="not-found-words1"> currentDate:2018-08-01 21:30:44
<p class="not-found-words1"> userIp:36.149.210.62, 10.71.192.26
<p class="not-found-words1"> userAgent:python-requests/2.18.4
<a class="back-to-home" href="http://www.dianping.com">
<button type="button"></button>
</a>
</div>
</div>
< / div >
< div class= "footer-container" >
<div id="channel-footer" class="channel-footer">
<p class="links"> <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about1" rel="nofollow"></a>| <a target="_blank" href="https://dpapp-appeal.meituan.com/-sharp/shopCreditRegulationPC" rel="nofollow"></a>| <a target="_blank" href="//www.dianping.com/help" rel="nofollow"></a>| <a target="_blank" href="http://www.dianping.com/sitemap/c1c10"></a>| <a target="_blank" href="//www.dianping.com/business/" rel="nofollow"></a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=media1" rel="nofollow"></a>| <a target="_blank" href="http://careers.dianping.com" rel="nofollow"></a>|
<!--footer links--> <span class="links-container"> <a class="ext-links" href="javascript:void(0);" rel="nofollow"></a>| </span> <a target="_blank" href="http://www.dianping.com/forum" rel="nofollow"></a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about4" rel="nofollow"></a>| <a target="_blank" href="http://developer.dianping.com" rel="nofollow"></a>| <a target="_blank" href="https://developer.meituan.com/?applyFrom=dianping_c_pc_busines" rel="nofollow"></a>
<!-- footer links -->
<div class="ext-container Hide">
<div class="link-items Hide">
<a target="_blank" href="http://www.dianping.com/wedding/wenda"><span></span></a>
<a target="_blank" href="http://www.dianping.com/home/wenda"><span></span></a>
<a target="_blank" href="http://www.dianping.com/home-tuku"><span></span></a>
<a target="_blank" href="//m.dianping.com/home-tuku"><span></span></a>
<a target="_blank" href="http://www.dianping.com/wedding"><span></span></a>
<a target="_blank" href="http://www.dianping.com/plastic/item"><span></span></a>
<a target="_blank" href="http://www.dianping.com/plastic/wenda"><span></span></a>
<a target="_blank" href="http://www.dianping.com/movie"><span></span></a>
<a target="_blank" href="http://www.dianping.com/baby/wenda"><span></span></a>
</div>
</div>
<p class="rights"> <span style="margin-right:10px;">2003-2018 dianping.com, All Rights Reserved.</span> <span> <a rel="nofollow" class="G" href="//www.dianping.com/help/center/rule?name=base2"></a> </span>
</div>
<script> (function(){var h=navigator.userAgent;var i=navigator.appName;var b=i.indexOf("Microsoft Internet Explorer")!==-1;if(!b){return false}var d=/MSIE (\d+).0/g;var e=d.exec(h);if(e&&e.length&&e[1]<9){var j="<div class="browser-overlay"></div><div id="browser-ie-con" class="browser-ie-con"><div id="browser-close" class="close"></div><div class="browser-download chrome"><a href="//www.google.cn/chrome/browser/desktop/index.html?utm_dp" target="_black" title="chrome"></a></div><div class="browser-download firefox"><a href="//www.firefox.com.cn/download/?utm_dp" target="_black" title="firefox"></a></div></div>";var f=document.createElement("div");f.id="browser-update-ie";f.className="browser-update-ie";f.innerHTML=j;document.body.appendChild(f);var a=document.documentElement.clientWidth||document.body.clientWidth;var c=document.getElementById("browser-ie-con").offsetWidth;var g=(a-c)/2;document.getElementById("browser-ie-con").style.left=g+"px";document.getElementById("browser-close").attachEvent("onclick",function(){document.getElementById("browser-update-ie").style.display="none"},false)}})(); </script>