1. Code
import requests
import os
from re import findall, DOTALL, search
from bs4 import BeautifulSoup
from urllib import parse

# 1. Get the result URLs from the first 3 pages of a Baidu search for a keyword
#    Parameter: keyword; returns a list of URLs
# 2. Crawl each URL and collect the hrefs on that page with the required extension
#    Parameters: url, extension_word; returns the list of matching URLs on that page
# 3. Check whether each URL is reachable
# 4. Write the results to a txt file, one URL per line

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
num = 0

# Fetch one Baidu search results page (base_url) and collect the result URLs
def parse_baidu_url(url):
    global headers, num
    url_list = []
    response = requests.get(url=url, headers=headers)
    response = response.content.decode("utf-8")
    soup = BeautifulSoup(response, "lxml")
    h3_labels = soup.find_all("h3", attrs={"class": "t"})
    for h3_label in h3_labels:
        a_labels = h3_label.find_all("a")
        for a_label in a_labels:
            href = a_label['href']
            # Verify that the result URL is reachable
            try:
                response = requests.get(href, headers=headers, timeout=3)
                if response.status_code == 200:
                    test_url = response.url
                    url_list.append(test_url)
                    # Progress counter
                    num = num + 1
                    print(num)
                elif response.status_code == 302:
                    test_url = response.headers['Location']
                    url_list.append(test_url)
                    # Progress counter
                    num = num + 1
                    print(num)
            except Exception:
                pass
    return url_list
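
# Note: a lighter-weight sketch (not part of the original flow) for resolving a
# Baidu redirect link without downloading the whole target page. requests.head()
# does not follow redirects by default, so allow_redirects=True is passed explicitly.
def resolve_result_link(href):
    try:
        # The final URL after redirects, or None if the request fails
        return requests.head(href, headers=headers, timeout=3, allow_redirects=True).url
    except Exception:
        return None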
# 1. Get the result URLs from the first 3 pages of a Baidu search for a keyword
#    Parameter: keyword; returns a list of URLs
def get_baidu_url(keyword):
    url_list = []
    base_url = "https://www.baidu.com/s?wd={}&pn={}&ie=utf-8"
    for page in range(1, 4):
        pn = (page - 1) * 10
        # Build the page URL without overwriting base_url, otherwise later pages reuse page 1's query
        page_url = base_url.format(keyword, pn)
        url_list.append(parse_baidu_url(page_url))
    return url_list
# 2. Crawl a URL and collect the hrefs on that page with the required extension
#    Parameters: url, keyword; returns the list of matching URLs on that page
#    (the extensions are currently hard-coded below, so keyword is not used yet)
def get_keyword_url(url, keyword):
    global headers
    response = requests.get(url=url, headers=headers).text
    hrefs = findall('<a.*?href=(".*?").*?>.*?</a>', response, DOTALL)
    # Deduplicate while preserving order
    hrefs = list(dict.fromkeys(hrefs))
    print("[+] Deduplication finished")
    print(hrefs)
    # Check each href for the required extension
    url_list = []
    base_Domains = parse.urlparse(url)
    base_Domain = str(base_Domains[0]) + "://" + str(base_Domains[1])
    for href in hrefs:
        filename = os.path.basename(href).strip('"')
        (shortname, extension) = os.path.splitext(filename)
        if extension == '.action' or extension == '.jsp' or extension == '.do':
            if "http://" in href or "https://" in href:
                # Absolute URL: keep it as-is
                result_url = href.strip('"')
                url_list.append(result_url)
            elif search(r".*?\..*?/", href):
                # href already contains a host part (e.g. //host/path): prepend the scheme
                result_url = str(base_Domains[0]) + ":" + href.strip('"')
                url_list.append(result_url)
            else:
                # Relative path: prepend the base domain
                result_url = base_Domain + "/" + href.strip('"')
                url_list.append(result_url)
    print("[+] Keyword URL extraction finished")
    print(url_list)
    return url_list
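
# Note: an alternative sketch (not part of the original flow) for resolving hrefs.
# urllib.parse.urljoin handles absolute, protocol-relative (//host/path) and relative
# hrefs uniformly, and could replace the manual branching above.
def resolve_href(page_url, href):
    # Strip the quotes captured by the regex, then join against the page URL
    return parse.urljoin(page_url, href.strip('"'))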
# 3. Check availability
def check_url(list0):
    ls = []

    # Recursively flatten the (possibly nested) list of URLs
    def getitem(l):
        for item in l:
            if isinstance(item, list):
                getitem(item)
            else:
                ls.append(item)

    getitem(list0)
    print("[+] Recursive flattening finished")
    print(ls)
    # Keep only the URLs that respond with 200
    list3 = []
    for url in ls:
        try:
            response = requests.get(url=url, headers=headers, timeout=3)
            if response.status_code == 200:
                list3.append(url)
        except:
            pass
    print("[+] Availability check finished")
    return list3
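
# Note: an alternative sketch (not used by the script) for the flattening step only.
# itertools.chain.from_iterable works here because the lists built in main() are
# nested exactly one level deep (a list of per-page lists).
def flatten_one_level(list_of_lists):
    from itertools import chain
    # Concatenate the sub-lists into one flat list
    return list(chain.from_iterable(list_of_lists))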
# 4. Write a list to a file, one URL per line
def file_write_list(url_list):
    with open("url_list.txt", "w", encoding="utf-8") as file:
        for url in url_list:
            file.write(url + "\n")
    print("[+] File written")
# 5. Main
def main():
    # Get the result URLs from the Baidu keyword search
    url_list1 = get_baidu_url("nihao")
    url_list1 = check_url(url_list1)
    # Extract the URLs with the required extension from each result page
    url_list4 = []
    for url in url_list1:
        url_list3 = get_keyword_url(url=url, keyword=".action")
        url_list4.append(url_list3)
    url_list4 = check_url(url_list4)
    file_write_list(url_list4)

if __name__ == '__main__':
    main()
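
# Usage note (a sketch, not part of the original script): if the search keyword
# contains Chinese or other non-ASCII characters, it should be percent-encoded
# before being substituted into the Baidu search URL. parse.quote from the
# already-imported urllib.parse can do this; "nihao" above needs no encoding.
def get_baidu_url_encoded(keyword):
    # Hypothetical helper: percent-encode the keyword, then reuse get_baidu_url
    return get_baidu_url(parse.quote(keyword))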