  • Exercise 1 -- Fetching the first 3 pages of Baidu search results with Python (page count configurable)

    1. Code

    import requests
    import os
    from re import findall,DOTALL,search
    from bs4 import BeautifulSoup
    from urllib import parse
    
    #1. Get the URLs on the first 3 pages of Baidu results for a keyword
        # Parameter: keyword; returns a list of URLs
    #2. Crawl each URL and collect the hrefs with the required extension
        # Parameters: url, extension_word; returns the matching URLs on that page
    #3. Check whether each URL is reachable
    #4. Write the results to a txt file, one URL per line
    
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    num = 0
    
    # Fetch the result URLs from a Baidu search-results page (base_url)
    def parse_baidu_url(url):
        global headers,num
        url_list = []
        response = requests.get(url=url,headers=headers)
        response = response.content.decode("utf-8")
        soup = BeautifulSoup(response,"lxml")
        h3_labels = soup.find_all("h3",attrs={"class":"t"})
        for h3_label in h3_labels:
            a_labels = h3_label.find_all("a")
            for a_label in a_labels:
                href = a_label['href']
                # Verify the availability of each URL in the search results
                try:
                    response = requests.get(href,headers=headers,timeout=3)
                    try:
                        if response.status_code == 200:
                            test_url = response.url
                            url_list.append(test_url)
                            # progress counter
                            num = num + 1
                            print(num)
                        elif response.status_code == 302:
                            # Rarely reached: requests follows redirects by default,
                            # so 302s are normally resolved before this check
                            test_url = response.headers['Location']
                            url_list.append(test_url)
                            # progress counter
                            num = num + 1
                            print(num)
                    except Exception as e:
                        pass
                except Exception as e:
                    pass
        return url_list
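
    # Note: Baidu result links are typically redirect URLs of the form
    # https://www.baidu.com/link?url=..., which is why the code above requests
    # each href and records response.url (the final, resolved address).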
    
    #1. Get the URLs on the first 3 pages of Baidu results for the keyword
    # Parameter: keyword; returns a list of URLs
    def get_baidu_url(keyword):
        url_list = []
        base_url = "https://www.baidu.com/s?wd={}&pn={}&ie=utf-8"
        for page in range(1, 4):
            pn = (page - 1) * 10
            # Format into a fresh variable: overwriting base_url would destroy the
            # {} placeholders after the first pass and fetch page 1 three times
            page_url = base_url.format(parse.quote(keyword), pn)  # quote() URL-encodes the keyword
            url_list.append(parse_baidu_url(page_url))
        return url_list
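
    # The title promises a configurable page count. A minimal sketch of that
    # variant (hypothetical, not called by main below), assuming Baidu keeps
    # paging with a pn offset of 10 results per page:
    def get_baidu_url_pages(keyword, pages=3):
        base_url = "https://www.baidu.com/s?wd={}&pn={}&ie=utf-8"
        urls = []
        for page in range(pages):
            urls.append(parse_baidu_url(base_url.format(parse.quote(keyword), page * 10)))
        return urls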
    
    #2. Crawl each URL and collect the hrefs with the required extension
    # Parameters: url, extension_word; returns the matching URLs on that page
    def get_keyword_url(url,keyword):
        global headers
        response = requests.get(url=url,headers=headers).text
        # Capture each href value (without the surrounding quotes)
        hrefs = findall(r'<a.*?href="(.*?)".*?>.*?</a>', response, DOTALL)
        # Deduplicate while preserving order
        hrefs = list(dict.fromkeys(hrefs))
        print("[+] Deduplication finished")
        print(hrefs)
        # Keep only hrefs whose path ends with one of the required extensions
        url_list = []
        base_Domains = parse.urlparse(url)
        base_Domain = base_Domains.scheme + "://" + base_Domains.netloc
        for href in hrefs:
            filename = os.path.basename(href.split("?")[0])  # drop any query string
            (shortname, extension) = os.path.splitext(filename)
            # Note: the keyword parameter is currently unused; extensions are hardcoded
            if extension in ('.action', '.jsp', '.do'):
                if href.startswith("http://") or href.startswith("https://"):
                    url_list.append(href)
                elif href.startswith("//"):
                    # Protocol-relative href: prepend the current page's scheme
                    url_list.append(base_Domains.scheme + ":" + href)
                else:
                    # Relative href: prepend the current page's scheme and host
                    url_list.append(base_Domain + "/" + href.lstrip("/"))
        print("[+] 关键字url提取完成")
        print(url_list)
        return url_list
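
    # The branchy URL resolution above can be done in one call with
    # urllib.parse.urljoin, which handles absolute, protocol-relative and
    # relative hrefs alike -- an alternative sketch, not used by main:
    def resolve_href(page_url, href):
        # urljoin("https://a.com/x", "//b.com/y") -> "https://b.com/y"
        # urljoin("https://a.com/x", "list.do")   -> "https://a.com/list.do"
        return parse.urljoin(page_url, href)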
    
    #3. Check the availability of each URL
    def check_url(list0):
        # Keep the accumulator local: a module-level list would carry results
        # over between calls (check_url is called twice in main)
        ls = []
        # Recursively flatten the possibly nested list of URLs
        def getitem(l):
            for item in l:
                if isinstance(item, list):
                    getitem(item)
                else:
                    ls.append(item)
        getitem(list0)
        list3 = []
        print("[+] Recursive flattening finished")
        print(ls)
        for url in ls:
            try:
                response = requests.get(url=url,headers=headers,timeout=3)
                if response.status_code == 200:
                    list3.append(url)
            except Exception:
                pass
        print("[+] Availability check finished")
        return list3
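
    # The recursive flatten in check_url could equivalently be written as a
    # generator -- a small sketch:
    def flatten(l):
        for item in l:
            if isinstance(item, list):
                yield from flatten(item)
            else:
                yield item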
    
    
    #4. Write the list to a file, one URL per line
    def file_write_list(url_list):
        with open("url_list.txt","w",encoding="utf-8") as file:
            for url in url_list:
                file.write(url+"
    ")
        print("[+] 文件写入完成")
    
    
    #5. Main function
    def main():
        # Get the Baidu search-result URLs for the keyword
        url_list1 = get_baidu_url("nihao")
        url_list1 = check_url(url_list1)
        # From each page, extract the URLs with the required extension
        url_list4 = []
        for url in url_list1:
            url_list3 = get_keyword_url(url=url,keyword=".action")
            url_list4.append(url_list3)
        url_list4 = check_url(url_list4)
        file_write_list(url_list4)
    
    
    if __name__ == '__main__':
        main()
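
    2. Usage

    To change the search term, edit the get_baidu_url("nihao") call in main(); to change
    which extensions are kept, edit the hardcoded tuple in get_keyword_url. With the
    hypothetical get_baidu_url_pages variant sketched above, the page count becomes an
    argument:

        url_list1 = check_url(get_baidu_url_pages("nihao", pages=5))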
    
    
    
    
  • Original post: https://www.cnblogs.com/qianxinggz/p/11415488.html