zoukankan      html  css  js  c++  java
  • bs+selenium +python 爬取 免费代理IP 储存在mongo

    #*encoding=utf-8
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.chrome.options import Options as COptions
    from urllib.parse import quote
    from pyquery import PyQuery as pq
    import pymongo
    import time
    import datetime
    from bs4 import BeautifulSoup
    
    
    # --- Headless Chrome setup (no visible browser window) ---
    options = COptions()
    # NOTE(review): Chrome's documented flag is '--headless' (double dash);
    # whether '-headless' is honored depends on the Chrome version — confirm.
    options.add_argument('-headless')
    # NOTE(review): this Windows path appears to have lost its backslashes in
    # transit (likely "D:\...\chromedriver_win32\chromedriver.exe") — verify.
    # 'executable_path'/'chrome_options' kwargs were removed in Selenium 4;
    # presumably this was written against Selenium 3 — confirm the version.
    browser = webdriver.Chrome(executable_path="D:PDFpy包chromedriver_win32chromedriver.exe",
                              chrome_options=options)
    # (original note) iPhone X emulation changed the search UI, so it was dropped:
    # browser = webdriver.Chrome(executable_path="D:PDFpy包chromedriver_win32chromedriver.exe")
    # Explicit-wait handle with a 30 s timeout (defined but never used below).
    wait = WebDriverWait(browser, 30)
    # keyword = input("请输入关键词:")
    # --- MongoDB on the default local port ---
    client = pymongo.MongoClient('mongodb://localhost:27017/')
    # Select the database
    # db = client.movie
    db = client['proxy']
    # Select the collection: one per day, e.g. 'proxy2018810'. Month/day are
    # NOT zero-padded, so collection names don't sort lexicographically.
    # collection = db.movies
    y = datetime.datetime.now().year
    m = datetime.datetime.now().month
    d = datetime.datetime.now().day
    collection = db['proxy'+str(y)+str(m)+str(d)]
    # Number of listing pages to crawl.
    MAX_PAGE = 10
    
    
    # Fetch one listing page and hand it to the parser.
    def index_page(page):
        """Load page `page` of the free-proxy listing and extract its rows.

        Retries on Selenium ``TimeoutException`` up to a fixed number of
        attempts. (The original recursed into itself on timeout with no
        limit, which risks RecursionError on a persistently slow site.)

        :param page: 1-based page number of https://www.kuaidaili.com/free/inha/
        """
        print("正在爬取", page, '页')
        url = 'https://www.kuaidaili.com/free/inha/' + str(page)
        print(url)
        for _ in range(5):  # bounded retries instead of unbounded recursion
            try:
                time.sleep(2)  # throttle requests to avoid hammering the site
                browser.get(url)
                get_proxys()
                return
            except TimeoutException:
                continue  # timed out: retry the same page
    
    
    # Parse the proxy table on the page currently loaded in `browser`.
    def get_proxys():
        """Extract every proxy row from the current page and save each to MongoDB.

        Reads ``browser.page_source``, locates the <tbody> of the listing
        table, and builds one dict per <tr> from its ``data-title`` cells.

        Fixes over the original: the inner ``for child1 in enumerate(...)``
        iterated (index, node) tuples and re-parsed their repr; the builtin
        ``type`` was shadowed; ``select(...)[0]`` raised IndexError on
        whitespace/malformed rows. Rows without an IP cell are now skipped.
        """
        soup = BeautifulSoup(browser.page_source, 'lxml')
        tbody = soup.tbody
        if tbody is None:
            # Page layout changed or the load failed: nothing to parse.
            return

        def _cell(row, title):
            # Text of the row's <td data-title="..."> cell, '' if absent.
            td = row.select_one('td[data-title="%s"]' % title)
            return td.get_text() if td is not None else ''

        for row in tbody.find_all('tr'):
            ip = _cell(row, 'IP')
            if not ip:
                continue  # not a data row (header, separator, or malformed)
            proxy = {
                'ip': ip,
                'port': _cell(row, 'PORT'),
                'nmd': _cell(row, '匿名度'),
                'type': _cell(row, '类型'),
                'position': _cell(row, '位置'),
                'speed': _cell(row, '响应速度'),
                'last_time': _cell(row, '最后验证时间'),
            }
            print(proxy)
            save_to_mongo(proxy)
    
    
    # Persist one proxy record into today's MongoDB collection.
    def save_to_mongo(product):
        """Insert ``product`` (one proxy dict) into the module-level ``collection``.

        Failures are reported but not re-raised, so one bad record does not
        abort the crawl. Unlike the original, the exception detail is printed
        instead of being silently swallowed.
        """
        try:
            collection.insert_one(product)
            print('保存成功')
        except Exception as e:
            # Surface the reason instead of an uninformative "failed".
            print('保存失败', e)
    
    
    # Entry point: crawl every listing page in turn.
    def main():
        """Crawl pages 1 through MAX_PAGE of the proxy listing, in order."""
        page = 1
        while page <= MAX_PAGE:
            index_page(page)
            page += 1
    
    
    # Guard the script entry so importing this module doesn't start a crawl.
    if __name__ == '__main__':
        main()
    

      

    你不能把坏习惯扔出窗外 但你可以一步步赶下电梯
  • 相关阅读:
    77777 77777(2) WriteUp 绕waf技巧学习
    简单sql注入学到的延时盲注新式攻击
    代码审计 => 74cms_v3.5.1.20141128 一系列漏洞
    mysql注入新姿势(数字与字符编码注入) hex,conv
    Netty 3升级Netty4实践
    微信小程序获取用户openid,头像昵称信息,后台java代码
    转:JSP 分页显示数据 (Oracle)
    从数据库提取数据通过jstl显示在jsp页面上
    转:微信生成二维码java
    转:微信开发获取地理位置实例(java,非常详细,附工程源码)
  • 原文地址:https://www.cnblogs.com/Ychao/p/9459776.html
Copyright © 2011-2022 走看看