  • An attempt at crawling Weibo

    WeChat is fairly closed, while Weibo's friend/follow information is relatively open and can all be crawled;

    1) Find people, via their follow lists;
    2) Extract the Weibo data and store it in a database (a storage sketch follows this list);
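
    A minimal sketch of the storage step, assuming a local MongoDB instance and the test database / followers collection that the commented-out pymongo lines in the code listing refer to (save_follower is an illustrative helper name, not part of the original):

    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)   # assumption: mongod on the default local port
    followers = client.test.followers          # database "test", collection "followers"

    def save_follower(name):
        # upsert by name so that re-crawling the same user does not create duplicates
        followers.update_one({'name': name}, {'$set': {'type': 'follower'}}, upsert=True)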

    Weibo nickname and avatar;
    follow count, fan count, and number of posts;
    decide, based on a few basic rules, whether to put a user's Weibo into the crawl queue;

    Indicators: the number of people the user follows;
    the number of fans; but that can be a very large number, and many of them are zombie fans; (drawbacks: first, it is inefficient;
    second, the platform will not let you page through the list indefinitely, there are bound to be limits)
    the post count and the fan count are important reference points;

    How do you decide that a crawled user is not worth following? You can first do a targeted analysis of what the Weibo accounts in your target domain typically look like; (a sketch of such a filter follows this list)
        1) If the user has posted very few Weibo posts, treat the account as a zombie user and skip it; i.e. the post count is below some lower bound;
        2) If the user posts an enormous number of Weibo posts, say more than 100 a day, it is probably a small advertiser or a bot;
        3) Accounts that mostly repost are about as useless as zombie accounts: you may crawl a large number of their posts and find that they are all duplicated information;
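
    The thresholds min_mblogs_allowed and max_follow_fans_ratio_allowed defined in the code below suggest a filter along the following lines; this is only a sketch, and the daily-post cap and the way the follow/fan ratio is applied are my reading of the rules above rather than something fixed in the original code:

    min_mblogs_allowed = 10                 # same value as in the code below
    max_follow_fans_ratio_allowed = 3       # same value as in the code below
    max_mblogs_per_day = 100                # assumption: above this, likely an advertiser or a bot

    def should_enqueue_user(mblogs, follows, fans, mblogs_per_day):
        if mblogs < min_mblogs_allowed:            # too few posts: likely a zombie account
            return False
        if mblogs_per_day > max_mblogs_per_day:    # posts far too often: likely spam or a bot
            return False
        if fans > 0 and follows / fans > max_follow_fans_ratio_allowed:
            # follows many more people than follow back: usually a low-value account
            return False
        return True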

    # Part of the code is listed below:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr  1 10:18:42 2018

    @author: Joe3223
    """
    import time
    import os
    import re
    from bs4 import BeautifulSoup
    from urllib.request import urlopen
    from selenium import webdriver
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    #import pymongo
    #from pymongo import MongoClient
    import hashlib
    from collections import deque
    from lxml import etree
    import threading

    # Database setup; MongoDB would be used here (left commented out in this excerpt)
    #client = MongoClient('localhost',27017)
    #db = client.test
    #followers = db.followers

    # Note: without setting a user-agent here, the redirect may not work
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
    )
    ##dcap = dict(DesiredCapabilities.PHANTOMJS)
    ##dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap = dict(DesiredCapabilities.FIREFOX)
    dcap["firefox.page.settings.userAgent"] = user_agent

    #browserPath = '/opt/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'
    #browserPath = '/usr/bin/phantomjs'
    # Preparation of some basic parameters
    parser = 'html5lib'
    domain = "weibo.com"
    url_home = "http://" + domain
    download_bf = deque()                # deque of MD5 digests of URLs already seen, so multithreaded crawling stays safe
    cur_queue = deque()
    min_mblogs_allowed = 10              # crawl threshold settings
    max_follow_fans_ratio_allowed = 3


    # There are two crawlers here: one crawls Weibo posts, the other crawls user data
    weibo_driver = webdriver.Firefox()  # the Weibo post crawler (the user-data crawler is commented out in this excerpt)
    weibo_driver.set_window_size(1920, 1200)  # optional

    # Enqueue a URL, after first checking whether it has been seen before
    def enqueueUrl(url):
        try:
            md5v = hashlib.md5(url.encode('utf-8')).hexdigest()  # md5() needs bytes in Python 3
            if md5v not in download_bf: # skip URLs we have already seen
                print(url + ' is added to queue')
                cur_queue.append(url)
                download_bf.append(md5v)
            # else:
                # print 'Skip %s' % (url)
        except ValueError:
            pass

    # Pop a URL from the left end of the queue
    def dequeuUrl():
        return cur_queue.popleft()
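
    # Sketch of how the queue could be drained; the excerpt below never consumes cur_queue itself,
    # and crawl_loop / process_user are illustrative names, not part of the original code.
    def crawl_loop(process_user):
        while len(cur_queue) > 0:
            url = dequeuUrl()
            process_user(url)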

    # Go to the next page and keep crawling
    def go_next_page(cur_driver):
        try:
            next_page = cur_driver.find_element_by_xpath('//a[contains(@class, "page next")]').get_attribute('href')
            print('next page is ' + next_page)
            cur_driver.get(next_page)
            time.sleep(3)
            return True
        except Exception:
            print('next page is not found')
            return False

    # Try to fetch elements via XPath, retrying up to 6 times
    def get_element_by_xpath(cur_driver, path):
        tried = 0
        while tried < 6:
            html = cur_driver.page_source
            tr = etree.HTML(html)
            elements = tr.xpath(path)
            if len(elements) == 0:
                tried += 1       # without this the loop would never terminate
                time.sleep(1)
                continue
            return elements
        return []                # nothing found after 6 attempts

    # Scroll the page so that the lazily loaded data is actually rendered
    def scroll_to_bottom():
        # try at most 50 scrolls
        print('scroll down')
        for i in range(0,50):
            # print 'scrolling for the %d time' % (i)
            weibo_driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            html = weibo_driver.page_source
            tr = etree.HTML(html)
            next_page_url = tr.xpath('//a[contains(@class,"page next")]')
            if len(next_page_url) > 0:
                return next_page_url[0].get('href')
            if len(re.findall('点击重新载入', html)) > 0:
                # '点击重新载入' is the "click to reload" link Weibo shows when lazy loading fails
                print('scrolling failed, reload it')
                weibo_driver.find_element_by_link_text('点击重新载入').click()
            time.sleep(1)

    # Extract Weibo post data
    def extract_feed(feeds):
        for i in range(0,20):
        # scrolling is only needed when crawling post data
            scroll_to_bottom()
            for element in weibo_driver.find_elements_by_class_name('WB_detail'):
                tried = 0
                while tried < 3:
                    try:
                        feed = {}
                        feed['time'] = element.find_element_by_xpath('.//div[@class="WB_from S_txt2"]').text
                        feed['content'] = element.find_element_by_class_name('WB_text').text
                        feed['image_names'] = []
                        for image in element.find_elements_by_xpath('.//li[contains(@class,"WB_pic")]/img'):
                            feed['image_names'].append(re.findall('/([^/]+)$', image.get_attribute('src')))
                        feeds.append(feed)
                        print('--------------------')
                        print(feed['time'])
                        print(feed['content'])
                        break
                    except Exception:
                        tried += 1
                        time.sleep(1)
            # go to the next page of posts
            if go_next_page(weibo_driver) is False:
                return feeds
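
    # Sketch of persisting the extracted posts; "collection" would be something like db.feeds once
    # the commented-out MongoDB lines near the top are enabled (an assumption, not in the original).
    def save_feeds(collection, feeds):
        # insert_many raises on an empty list, so guard first
        if len(feeds) > 0:
            collection.insert_many(feeds)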

    # Parse the names of followed users and the pagination URLs out of a follow-list page's HTML
    def getFollows(pageInfo):
        pattern3 = re.compile('class="S_txt1" title="(.*?)".*?usercard')
        follows = re.findall(pattern3, pageInfo)
        print(follows)
        for i in follows:
            print(i)
            #follower = {"name":i,"type":"follower"}
            #rs = followers.insert_one(follower)
            #print('one insert:{0}'.format(rs.inserted_id))
        
        ##urlsToScrawl = []
        ##urlsScrawled = []
        patterUrls = re.compile('<a bpfilter="page" class="page S_txt1"[\s\S]*?href="([\s\S]*?pids=Pl_Official_RelationMyfollow__92&amp;cfs=&amp;Pl_Official_RelationMyfollow__92_page=[\s\S]*?)"')
        follows = re.findall(patterUrls, pageInfo)
        for i in follows:
            print("http://weibo.com/"+i)
            ##if i not in urlsScrawled and i not in urlsToScrawl:
            ##urlsToScrapy.append("http://weibo.com/"+i)    
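
    # Sketch of wiring getFollows to the follow-list link that main() extracts below; the excerpt
    # defines getFollows but never calls it, so this mirrors the commented-out
    # user_driver.get("http"+follows_link) line and is an assumption about the intended flow:
    # weibo_driver.get("http" + follows_link)
    # time.sleep(5)
    # getFollows(weibo_driver.page_source)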


    def login(current_driver,username, password):
        #driver = webdriver.PhantomJS(executable_path=browserPath)  # path to the browser binary
        #driver = webdriver.PhantomJS(desired_capabilities=dcap)
        #driver = webdriver.Firefox()
        #driver.set_window_size(1920, 1200)
        
        current_driver.get(url_home)  # open the target page
        #bsObj = BeautifulSoup(user_driver.page_source, parser)  # parse the page's HTML source
        time.sleep(10)
        #user_driver.save_screenshot("weiboLogin0.png")

        # log in
        current_driver.find_element_by_id('loginname').send_keys(username)
        #user_driver.find_element_by_id('password').send_keys(password)
        #user_driver.find_element_by_xpath('//div[contains(@class,"input_wrap ")][0]/input').send_keys(password)
        current_driver.find_element_by_xpath('/html/body/div[1]/div[1]/div/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div/div/div/div[3]/div[2]/div/input').send_keys(password)
        # click the login button
        current_driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()
        time.sleep(8)
        current_driver.save_screenshot("weiboLogin.png")

        ##verifyCode = input("Please input verify code:")            
        ##user_driver.find_element_by_xpath('/html/body/div[1]/div[1]/div/div[2]/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div/div/div/div[3]/div[3]/div/input').send_keys(verifyCode)
        ##user_driver.find_element_by_xpath('//div[contains(@class,"login_btn")][1]/a').click()
        ##time.sleep(8)
        ##user_driver.save_screenshot("weiboLogin2.png")
        


    def main(username, password):
        # log in
        #login(user_driver,username, password)
        login(weibo_driver,username, password)
        
        # wait a while before the next steps
        time.sleep(30)
        #user_driver.save_screenshot("weibo.png")
              
        ## Start crawling from a big-V (influencer) account; this is the real entry URL
        user_link = "https://weibo.com/u/3738542230?topnav=1&wvr=6&topsug=1&is_hot=1"
        print('downloading ' + user_link)
        weibo_driver.get(user_link)
        time.sleep(5)
        
        # extract the account name
        account_name = get_element_by_xpath(weibo_driver, '//h1')[0].text
        photo = get_element_by_xpath(weibo_driver, '//p[@class="photo_wrap"]/img')[0].get('src')
        account_photo = re.findall('/([^/]+)$', photo)
        # extract the link to the user's follow-list page
        follows_link = get_element_by_xpath(weibo_driver, '//a[@class="t_link S_txt1"]')[0].get('href')
        print('account: ' + account_name)
        print('account_photo: '+account_photo[0])
        print('follows link is ' + follows_link)

        #user_driver.get("http"+follows_link)
        feeds = []
        #users = []
        # start a thread to fetch the Weibo post data
        t_feeds = threading.Thread(target=extract_feed, name=None, args=(feeds,))
        t_feeds.start()
        t_feeds.join()
        

    if __name__ == '__main__':
        main("你的用户","你的密码")
        #login(user_driver,"570876459@qq.com", "xiaowuwu!!!")
        #login(weibo_driver,username, pass

  • Original post: https://www.cnblogs.com/daluozi/p/9466430.html