The idea is to first build the URL list all_url, and then do

for i in range(0, len(all_url)):
    urlqueue.put(all_url[i])

so that each thread can pull a URL from the queue with get().

The problem is that the range cannot be written as 0 to the list length: it raises

IndexError: list index out of range

The list itself is fine; it has no empty entries. And if the list has 2000 items but I only use range(0, 1000), there is no error, which is troublesome.
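To make it easier to reproduce, here is a stripped-down version of the loop from the full script below, using made-up placeholder strings instead of the real URLs:

from queue import Queue

all_url = ["url" + str(i) for i in range(2000)]  # placeholder data, not the real list
urlqueue = Queue()
for i in range(0, len(all_url)):
    urlqueue.put(all_url[i])
    del all_url[i]  # same del as in the full loop below
# fails partway through with IndexError: list index out of range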
Here is the full code:
import requests
from lxml import html
import time
import threading
from queue import Queue
class Spider(threading.Thread):
    def __init__(self, name, urlqueue):
        super().__init__()
        self.name = name
        self.urlqueue = urlqueue

    def run(self):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36"
        }
        print("Thread " + self.name + " started")
        while not self.urlqueue.empty():
            try:
                url = self.urlqueue.get()
                rep = requests.get(url, headers=headers, timeout=5)
                time.sleep(1)
                if rep.status_code == 200:
                    print("Request OK")
                    self.parse(rep)
                    print(url + " done")
            except Exception as e:
                print("Failed: " + url + ", error:", e)
                pass
        print("Thread " + self.name + " finished")

    def parse(self, rep):
        con = rep.content
        sel = html.fromstring(con)
        title = sel.xpath("//div[@class='titmain']/h1/text()")
        title = str(title).replace("]", "").replace("[", "").replace("'", "").replace(",", "").replace(r"\r\n", "").replace('"', "").replace(" ", "").replace(r"\xa0", "").replace("?", "").replace("/", "").replace(r"\u3000", " ")
        date = sel.xpath("//div[@class='texttit_m1']/p/text()")
        date = str(date).replace("]", "").replace("[", "").replace("'", "").replace(r"\u3000", " ")
        if len(date) > 20:
            file_name = title + ".txt"
            a = open(file_name, "w+", encoding="utf-8")
            a.write("\n" + str(title) + "\n" + "\n" + str(date))
            print(file_name + " saved")
            a.close()
        else:
            pass
if name ="_ _ main__":
with open("url.txt") as f:
data = f.readline()
-sharp
james = data.strip().split(",")
-sharp
all_url = []
for jame in james:
a=eval(jame)
-sharpifu
all_url.append(a)
print(len(all_url))
start = time.time()
urlqueue = Queue()
threadNum = 3 -sharp
for i in range(0, 1468):
urlqueue.put(all_url[i]) -sharp
del all_url[i]
threads = []
for i in range(1, threadNum+1):
thread = Spider("" + str(i), urlqueue)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
with open("url.txt", "w+") as b:
b.write("\n".join([str(all_url)]))
b.write("\n" + "=" *50 + "\n")
b.close
print(" url ")
end = time.time()
print("-------------------------------")
print(". {}".format(end-start))
Also, the URLs are read from a txt file; I don't know how to upload it here, but the final all_url list is definitely fine.
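For reference, given how the file is read above (readline, strip, split on commas, then eval on each piece), url.txt holds one line of comma-separated, quoted URLs. Something like this, with made-up addresses just to show the shape:

"http://example.com/page1","http://example.com/page2","http://example.com/page3"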