zoukankan      html  css  js  c++  java
  • Python获取谷歌搜索结果

    由于公司项目需要获取谷歌搜索结果,而谷歌搜索出来的结果大多不足 20 页,所以需要逐页翻页抓取,直到没有下一页为止。直接上代码:

    import time
    from pprint import pprint
    import requests
    from bs4 import BeautifulSoup
    
    
    class GoogleSpider:
        """Scrape Google web-search result pages for a keyword.

        The spider requests the first result page, extracts every organic
        result (``div.g`` blocks) and keeps following the "next page" link
        (``a#pnnext``) until there is none left.

        The page markup this relies on (``div.g``, ``h3``, ``a#pnnext``) is
        whatever Google happens to serve; if that markup changes the spider
        simply finds no results.
        """

        # Browser-like UA; sent with every request.
        USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
        # HTTPS so the query is not sent in clear text.
        BASE_URL = "https://www.google.com"
        # Seconds to wait for a response before giving up on the page.
        REQUEST_TIMEOUT = 10
        # Seconds to pause between two consecutive result pages.
        PAGE_DELAY = 1

        def __init__(self, **kwargs):
            """Create a spider.

            Args:
                **kwargs: ``keyword`` (optional) is stored as the default
                    search term used when ``search`` is called without one.
            """
            self.keyword = kwargs.get("keyword")

        def __del__(self):
            """No-op finalizer, kept only for backward compatibility."""
            pass

        @staticmethod
        def _parse_results(soup) -> list:
            """Return ``{"title", "link"}`` dicts for each result block in *soup*.

            A ``div.g`` block is kept only when its first ``<a>`` carries an
            ``href`` and the block contains an ``<h3>`` title.
            """
            results = []
            for block in soup.find_all("div", class_="g"):
                anchor = block.find("a")
                if anchor is None or not anchor.has_attr("href"):
                    continue
                heading = block.find("h3")
                if heading is None:
                    continue
                results.append({
                    "title": heading.get_text(),
                    "link": anchor["href"],
                })
            return results

        def search(self, **kwargs) -> list:
            """Fetch all result pages for a keyword.

            Args:
                **kwargs: ``keyword`` (optional) overrides the keyword given
                    to the constructor.

            Returns:
                A list with one ``{page_number: [result, ...]}`` dict per
                fetched page, where each result is
                ``{"title": str, "link": str}``. An empty list is returned
                when no (non-blank) keyword is available. On a network error
                or a non-200 response the pages collected so far are
                returned.
            """
            keyword = kwargs.get("keyword")
            if keyword is None:
                keyword = self.keyword
            if keyword is None:
                return []
            keyword = str(keyword).strip()
            if not keyword:
                # Nothing to search for; avoid a pointless request.
                return []

            data = []
            headers = {"user-agent": self.USER_AGENT}
            url = self.BASE_URL + "/search"
            # Let requests URL-encode the query (spaces, '&', non-ASCII, ...).
            params = {"q": keyword}
            page = 1
            while True:
                print(f"当前正在搜索【{keyword}】,当前第{page}页...")
                try:
                    resp = requests.get(
                        url,
                        params=params,
                        headers=headers,
                        verify=True,
                        timeout=self.REQUEST_TIMEOUT,
                    )
                    try:
                        status = resp.status_code
                        content = resp.content
                    finally:
                        # Always release the connection, on every path.
                        resp.close()
                except requests.RequestException as e:
                    # Best effort: report and return what was collected.
                    pprint(e)
                    break

                if status != 200:
                    # Retrying the same URL immediately would loop forever
                    # (e.g. on a 429 / captcha page), so stop here.
                    print(f"请求失败,状态码 {status},停止搜索")
                    break

                soup = BeautifulSoup(content, "html.parser")
                results = self._parse_results(soup)
                next_link = soup.find("a", id="pnnext")
                has_next = next_link is not None and next_link.has_attr("href")

                # Record the page even when it is the last one; only skip a
                # final page that yielded nothing at all.
                if results or has_next:
                    data.append({page: results})
                if not has_next:
                    break

                # The "next" href already carries the full query string.
                url = self.BASE_URL + str(next_link["href"])
                params = None
                page += 1
                time.sleep(self.PAGE_DELAY)
            return data
    from GoogleSpider import GoogleSpider
    
    # Demo entry point: search for a sample keyword and dump the raw result.
    if __name__ == "__main__":
        spider = GoogleSpider()
        found = spider.search(keyword="china")
        print(found)
  • 相关阅读:
    default.js 下的 setPromise(WinJS.UI.processAll());
    选择排序
    插入排序
    16、css实现div中图片占满整个屏幕
    21、解决关于 vue项目中 点击按钮路由多了个问号
    15、vue项目封装axios并访问接口
    17、在vue中引用移动端框架Vux:
    24、vuex刷新页面数据丢失解决办法
    18、git提交代码并将develop分支合并到master分支上
    20、解决Vue使用bus兄弟组件间传值,第一次监听不到数据
  • 原文地址:https://www.cnblogs.com/felixwan/p/15006516.html
Copyright © 2011-2022 走看看