I wrote a program to scrape data from weixin.sogou.com, and a CAPTCHA appears from time to time.
However, when I open weixin.sogou.com in a browser, no CAPTCHA is shown.
Doesn't Sogou decide whether to show the CAPTCHA based on the IP address?
Code:
def __weixin_search(self, keyword, cur_page, start_date, end_date, timeout=30):
    """Fetch the Sogou WeChat search result page for ``keyword``.

    The request is repeated until a response that does not contain the
    CAPTCHA form marker (``seccodeInput``) is received. Whenever the marker
    is found, the cached cookie is discarded so that ``__set_cookie`` is
    called again before the next attempt.

    Args:
        keyword: Search term. A ``unicode`` value is encoded to UTF-8
            before it is URL-quoted.
        cur_page: Accepted for interface compatibility; not used here.
        start_date: Accepted for interface compatibility; not used here.
        end_date: Accepted for interface compatibility; not used here.
        timeout: Timeout in seconds passed to ``__set_cookie`` and to
            ``requests.get``.

    Returns:
        The response body with HTML entities unescaped.
    """
    # Local import so this method does not depend on a file-level import
    # that may not exist.
    import logging

    # The URL does not change between attempts, so build it once.
    if isinstance(keyword, types.UnicodeType):
        keyword = keyword.encode("utf-8")
    url = "http://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query="
    url += urllib2.quote(keyword)

    while True:
        try:
            if self.__cookie is None:
                self.__set_cookie(timeout=timeout)
            headers = {
                # A different User-Agent is picked for every attempt.
                "User-Agent": random.choice(USER_AGENTS),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate, sdch",
                "DNT": "1",
                "Connection": "keep-alive",
                "Cookie": self.__cookie
            }
            r = requests.get(url=url, headers=headers, timeout=timeout)
            # Take the raw bytes directly. Round-tripping r.text through
            # ISO-8859-1 only works when requests guessed that encoding and
            # raises UnicodeEncodeError otherwise.
            html = r.content
            # Compare by value/containment, not identity ("is not -1"), and
            # use a byte-string needle so the page is never implicitly
            # decoded as ASCII.
            if html is None or b"seccodeInput" in html:
                # CAPTCHA page: drop the cookie so a new one is requested
                # on the next attempt, then back off briefly.
                self.__cookie = None
                time.sleep(1)
            else:
                t = HTMLParser.HTMLParser()
                return t.unescape(html)
        except Exception:
            # Keep the retry-forever behaviour, but record the failure and
            # pause instead of silently spinning in a tight loop.
            logging.getLogger(__name__).exception(
                "weixin search request failed; retrying")
            time.sleep(1)