As shown in the figure, the script fails.
At first the error was: IndexError: list index out of range.
I found that response.text was the same for every request, but I couldn't find the problem.
The specific code is as follows. Thank you.
import json
from urllib.parse import urlencode
import re
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request itself fails.
    """
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "cur_tab": 3,
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    # NOTE(review): a browser-like User-Agent is sent because sites commonly
    # serve identical placeholder content to the default python-requests
    # agent -- confirm this is what the server actually requires.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as exc:
        print("Failed to request index page:", url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON document.

    Args:
        html: JSON text whose top-level object holds a ``"data"`` list of
            objects; may be ``None`` or empty when the fetch failed.

    Yields:
        Each non-empty ``"article_url"`` value, in document order. Nothing
        is yielded for missing, empty, or malformed input.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: not valid JSON.
        return
    if not isinstance(data, dict):
        return
    items = data.get("data")
    if not isinstance(items, list):
        # Covers both a missing key and an explicit null.
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        article_url = item.get("article_url")
        # Entries without an article_url would otherwise yield None.
        if article_url:
            yield article_url
def get_page_detail(url):
    """Download a single article page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for an empty URL, any other status code, or a failed
        request.
    """
    if not url:
        return None
    # NOTE(review): a browser-like User-Agent is sent because sites commonly
    # serve identical placeholder content to the default python-requests
    # agent -- confirm this is what the server actually requires.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as exc:
        print("Failed to request detail page:", url, exc)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html):
    """Print the title and the embedded gallery data of an article page.

    Args:
        html: Page markup as text; ``None`` or empty input is ignored.

    Prints the text of the first ``<title>`` element when one exists, then
    the first regex capture of the gallery payload when the pattern matches.
    Returns ``None``.
    """
    if not html:
        return
    soup = BeautifulSoup(html, "lxml")
    titles = soup.select("title")
    # Pages without a <title> used to raise IndexError on [0].
    if titles:
        print(titles[0].get_text())
    # NOTE(review): pattern kept exactly as in the original. The unescaped
    # "." and the literal space before the group may be paste artifacts
    # (e.g. lost backslashes) -- verify against the real page source.
    images_pattern = re.compile("gallery: JSON.parse (.*?);", re.S)
    result = images_pattern.search(html)
    if result:
        print(result.group(1))
def main():
    """Fetch the search index, then fetch and parse each listed article."""
    # NOTE(review): the keyword is an empty string in the original; it may
    # have been lost in the paste -- confirm the intended search term.
    index_html = get_page_index(0, "")
    if not index_html:
        # The index request failed; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        # Distinct name so the index text is not rebound while iterating.
        detail_html = get_page_detail(url)
        if detail_html:
            parse_page_detail(detail_html)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()