zoukankan      html  css  js  c++  java
  • python 爬虫

    # -*- coding:utf-8 -*-
    __version__ = '1.0.0.0'
    """
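    @brief : Scrape poster nicknames and avatar images from the bbs.jj.cn forum.
    @details: Walks the list pages of board 173, opens every thread found there and saves each poster's avatar under images/.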
    @author : zhphuang
    @date : 2019-02-22
    """
    import os
    import time
    import random
    import requests
    import urllib.request
    from selenium import webdriver
    from bs4 import BeautifulSoup



    class Spider(object):
        """Base crawler that owns a Selenium-driven Chrome browser.

        Subclasses override :meth:`get_info` with the actual scraping logic.
        """

        def __init__(self):
            """Start Chrome through the ``chromedriver`` located next to this file."""
            options = webdriver.ChromeOptions()

            # Optional tweaks, disabled by default:
            # options.add_argument('--headless')  # run Chrome without a window
            # options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
            # options.add_experimental_option(
            #     "prefs",
            #     {"profile.managed_default_content_settings.images": 2,
            #      "profile.default_content_setting_values": {"notifications": 2}},
            # )  # do not load images
            #
            # NOTE(review): the ``chrome_options`` keyword and the positional
            # driver path are deprecated in newer Selenium releases in favour
            # of ``options=`` / ``service=`` -- confirm against the installed
            # Selenium version before changing this call.
            self.browser = webdriver.Chrome(self._getdriverpath(), chrome_options=options)
            # self.browser.implicitly_wait(60)

        def get_info(self):
            """Scrape data. Does nothing here; meant to be overridden."""
            pass

        def quit(self):
            """Shut the browser down and end the WebDriver session.

            ``close()`` only closes the current window and leaves the driver
            session running, so ``quit()`` is used to release everything.
            The call is a no-op when no browser was ever started (for example
            in a subclass that does not run this class's ``__init__``) or when
            it has already been shut down.
            """
            browser = getattr(self, "browser", None)
            if browser is None:
                return
            browser.quit()
            # Drop the reference so a second quit() does not touch a dead session.
            self.browser = None

        def _getdriverpath(self):
            """Return the path of the ``chromedriver`` binary beside this source file."""
            # Alternatives used during development:
            #   os.path.join(os.path.dirname(__file__), "phantomjs")
            #   "C://chromedriver.exe"
            return os.path.join(os.path.dirname(__file__), "chromedriver")


    class UserNameSpider(Spider):
        """Crawler that collects poster nicknames and avatars from bbs.jj.cn board 173.

        Pages are fetched with plain ``requests`` calls; the Selenium browser
        of the base class is not used by this class.
        """

        # Directory (relative to the working directory) where avatars are stored.
        IMAGE_DIR = 'images'

        def __init__(self):
            """Prepare the pool of User-Agent strings; one is picked at random per list page."""
            self.agents = [
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
                "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
                "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
                "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            ]
            # The base-class constructor stays disabled, as in the original: it
            # would launch Chrome, which get_info() below never uses.
            # super(UserNameSpider, self).__init__()

        def get_info(self, start_page=2, stop_page=10):
            """Crawl forum list pages and download the avatar of every poster found.

            :param start_page: first list page to visit (inclusive).
            :param stop_page: page number at which to stop (exclusive).

            The defaults visit pages 2 through 9. Network failures on a single
            page, thread or avatar are reported and skipped so that one bad
            request does not abort the whole crawl.
            """
            # urlretrieve() does not create missing directories.
            os.makedirs(self.IMAGE_DIR, exist_ok=True)

            # Users whose avatar is already on disk in this run; the same
            # person usually posts many times, so fetch each avatar only once.
            saved_users = set()

            for page in range(start_page, stop_page):
                headers = self._build_headers(page)
                for link in self._thread_links(page, headers):
                    self._scrape_thread(link, headers, saved_users)

        def _build_headers(self, page):
            """Return the request headers used for list page ``page`` and its threads."""
            return {
                # NOTE(review): 'path', 'scheme' and 'authority' look like
                # HTTP/2 pseudo-headers copied from browser dev tools; they are
                # sent here as ordinary headers -- confirm they are needed.
                'path': '/forumdisplay.php?fid=173&page=%s' % page,
                'referer': 'https://bbs.jj.cn/',
                'User-agent': random.choice(self.agents),
                'scheme': "https",
                'authority': 'bbs.jj.cn',
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                # 'br' is intentionally not advertised: requests can only decode
                # brotli when an optional brotli package is installed, and an
                # undecoded body would make res.text unparseable.
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'upgrade-insecure-requests': '1',
                # SECURITY: hard-coded logged-in session cookie (account id,
                # login info, auth token). It should be loaded from
                # configuration or the environment and the session revoked.
                "cookie": "UM_distinctid=1690eef0bfef2-0211bcc91dff28-36617102-1aeaa0-1690eef0bff6c7; Hm_lvt_65699696998080926ad677627de4c418=1550733742; qDH_visitedfid=173D254; qDH_oldtopics=D6937765D6937910D; Hm_lvt_c22ac8657ced9bd55e529ce7a0e1f7d9=1550835844; Hm_lpvt_c22ac8657ced9bd55e529ce7a0e1f7d9=1550835844; JJFormHashKey=912f18049dae7a13e03df54d7d72ca93; User_Id=733493473; User_Nick=d0c2cad6373333343933343733; FigureId=0; PartnerId=0; UserCookieKey=c937381015d9ce02d9a368faf1047195; u_ltime=1550835920; u_pass=_f200921f20362b33e26d534087c2d174; UserLoginInfo=18502789819%2C1%2C1550835920%2Cfdbf48be798373a6c77749a4283d572b; qDH_jjuid=733493473; qDH_cookietime=2592000; qDH_auth=32c7Ba7mL69GzoDwAp%2BYhulsbpodwzfokm%2FkX76Rlw4X%2Fv0yL7wkzP1XpFRjwUbFyVtNaupGuhztyemXr5sloJanzARf%2FsIW; qDH_sid=r7TFZe; CNZZDATA4054856=cnzz_eid%3D1428032294-1550729010-%26ntime%3D1550835718; qDH_onlineusernum=2801; Hm_lpvt_65699696998080926ad677627de4c418=1550836316",
            }

        def _fetch_soup(self, url, headers):
            """Download ``url`` and return it parsed, or ``None`` if the request failed."""
            try:
                res = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print("request failed, skipping %s: %s" % (url, exc))
                return None
            return BeautifulSoup(res.text, "html.parser")

        def _thread_links(self, page, headers):
            """Return the thread URLs listed on forum list page ``page``."""
            url = "https://bbs.jj.cn/forumdisplay.php?fid=173&page=%s" % page
            soup = self._fetch_soup(url, headers)
            if soup is None:
                return []

            links = []
            for tbody in soup.select("table#forum_173 > tbody"):
                # The thread id is the part after the first underscore of the
                # tbody id; rows without such an id are not threads.
                id_parts = tbody.attrs.get("id", "").split("_")
                if len(id_parts) < 2:
                    continue
                links.append("https://bbs.jj.cn/viewthread.php?tid=" + id_parts[1])
            return links

        def _scrape_thread(self, link, headers, saved_users):
            """Print every poster of the thread at ``link`` and save new avatars.

            ``saved_users`` is updated in place with the names whose avatar was
            stored successfully.
            """
            from urllib.parse import urljoin

            soup = self._fetch_soup(link, headers)
            if soup is None:
                return

            for post in soup.select("div#postlist > div "):
                nick_nodes = post.select("a.user_nick2")
                img_nodes = post.select("div.avatar > a > img")
                # Not every child of the post list carries a poster block.
                if not nick_nodes or not img_nodes:
                    continue
                src = img_nodes[0].attrs.get("src")
                if not src:
                    continue

                user_name = nick_nodes[0].text
                # Resolve relative image paths against the thread URL.
                avatar = urljoin(link, src)
                print(user_name, avatar)

                if user_name in saved_users:
                    continue
                if self._save_avatar(avatar, user_name):
                    saved_users.add(user_name)

        def _save_avatar(self, avatar_url, user_name):
            """Download ``avatar_url`` to the image directory; return ``True`` on success."""
            target = os.path.join(self.IMAGE_DIR, '%s.jpg' % self._safe_filename(user_name))
            try:
                urllib.request.urlretrieve(avatar_url, target)
            except (OSError, ValueError) as exc:
                # OSError covers URLError/HTTPError and disk errors; ValueError
                # is raised for malformed URLs.
                print("avatar download failed for %s: %s" % (user_name, exc))
                return False
            return True

        @staticmethod
        def _safe_filename(name):
            """Turn a forum nickname into a file name that cannot leave the image directory."""
            forbidden = '\\/:*?"<>|'
            cleaned = "".join(
                "_" if ch in forbidden or ord(ch) < 32 else ch for ch in name
            )
            # Leading/trailing dots and spaces are stripped so names such as
            # ".." cannot be produced.
            return cleaned.strip(" .") or "unknown"


    if __name__ == '__main__':
        # Script entry point: build the forum spider and run a single crawl.
        crawler = UserNameSpider()
        crawler.get_info()


    当值未一旬,而视茫茫,而发苍苍,而齿牙动摇
  • 相关阅读:
    VB.NET中vbcr 是回车、vbcrlf 是回车和换行的结合、vblf 是换行
    COM组件简介
    【转】ACE编程小结
    socket基础实例(一个服务端对应一个客户端情形)
    服务器中判断客户端socket断开连接的方法
    阻塞、非阻塞的概念和select函数的阻塞功能
    socket基础函数(2)
    线程初级基础(一)
    给程序员的五点建议--如何成为编程高手并以此创业
    Linux下常用软件
  • 原文地址:https://www.cnblogs.com/niuniuc/p/10423772.html
Copyright © 2011-2022 走看看