zoukankan      html  css  js  c++  java
  • 腾讯微博用户关注与听众的爬取

    按广度优先的方式爬取用户的关注和听众。腾讯微博已经停运了,网上找到的登录代码都已经过时,自己又分析不出登录流程,就直接把浏览器里的 cookie 复制了下来,这样就能获取需要登录才能看到的内容了。

    由于停运,每个用户的关注和听众各只能获取前40页的内容。输出文件的格式为 [source, target],表示 source 关注 target。由于从 source 爬取时可以在其关注里找到 target,而从 target 爬取时又会在其听众里找到 source,同一条关注关系可能出现两次,所以需要做去重。

    一小时大概能获取2万条消息。就一路写下去,没用线程。

    辣鸡代码如下:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    '''
    @author: Starry
    @file: Tencentweibo.py
    @time:  2018/7/15 9:50
    '''
    
    import requests
    from bs4  import BeautifulSoup
    from queue import Queue
    import time
    import datetime
    import json
    import csv
    import os
    
    
    # Cookies sent with every relation request made by the crawler.
    # The dict is empty in the published source; per the accompanying article the
    # author pasted cookies copied from a logged-in browser session here.
    cookies = {
        
    }
    
    # HTTP headers sent with every relation request made by the crawler.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "api.t.qq.com",
        "Pragma": "no-cache",
        "Referer": "http://api.t.qq.com/proxy.html",
        # NOTE(review): non-standard header; presumably the originating page URL
        # expected by the API -- confirm whether the hard-coded user matters.
        "rf": "http://t.qq.com/anjianbin1979/following?t=1#u=anjianbin1979&t=1&st=1&p=2",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    
    
    
    class TencentWeibo:
        """Breadth-first crawler for Tencent Weibo follow relations.

        Starting from one seed account, the crawler walks the "following" and
        "fans" lists of every discovered account and records the result in two
        CSV files in the current working directory:

        * ``information.csv`` -- one row ``[id, user, name]`` per discovered
          account, where ``id`` is a sequential integer assigned by the crawler.
        * ``data.csv`` -- one row ``[Source, Target]`` per follow edge, meaning
          account ``Source`` follows account ``Target`` (both as integer ids).

        If the files already exist, their content is loaded and the crawl
        resumes from the account that appears in the last recorded edge.

        Attributes:
            que: FIFO queue of account names still to be crawled.
            nameToId: maps account name -> integer id.
            current_num: highest id assigned so far.
            visName: account names already crawled (or skipped on resume).
            unique: maps source id -> set of target ids already written, used
                to avoid writing the same edge twice.
        """

        COUNT = 0
        # Number of result pages requested per account and relation type.
        MAX_PAGES = 40
        INFORMATION_FILE = 'information.csv'
        DATA_FILE = 'data.csv'
        # Placeholders: account name, relation type (1 = following, 2 = fans),
        # page number, millisecond timestamp.
        RELATION_URL = ("http://api.t.qq.com/relations/follow_apollo.php?u={0}&t={1}&st=1&p={2}"
                        "&apiType=14&apiHost=http://api.t.qq.com&_r={3}&g_tk=325301840")

        def __init__(self, start_name, start_title):
            """Create the crawler and load or create its CSV files.

            Args:
                start_name: account name of the seed user.
                start_title: display name of the seed user.
            """
            self.start_name = start_name
            # The misspelled attribute name is kept for backward compatibility.
            self.start_titile = start_title
            self.que = Queue()
            self.nameToId = {}
            self.current_num = 1
            self.visName = []
            self.unique = {}
            self._information_file = None
            self._data_file = None
            self.init_exe()

        def init_exe(self):
            """Open both CSV files, restore previous state and fill the queue."""
            self.close()
            pending = self._open_information()
            resume_id = self._open_data()
            self._enqueue_pending(pending, resume_id)
            print('开始爬取啦!!!')

        def _open_information(self):
            """Load known accounts, open the account writer, return names in file order."""
            path = self.INFORMATION_FILE
            has_content = os.path.exists(path) and os.path.getsize(path) > 0
            pending = []
            if has_content:
                with open(path, 'r', newline='', encoding='utf-8') as f:
                    rows = csv.reader(f, dialect='excel')
                    next(rows, None)  # skip the header row
                    for row in rows:
                        if len(row) < 2:
                            # Blank or truncated line (e.g. interrupted write).
                            continue
                        user_id = int(row[0])
                        pending.append(row[1])
                        self.nameToId[row[1]] = user_id
                        self.current_num = max(self.current_num, user_id)
            self._information_file = open(path, 'a', newline='', encoding='utf-8')
            self.csv_information = csv.writer(self._information_file, dialect='excel')
            if not has_content:
                self.csv_information.writerow(['id', 'user', 'name'])
            if not pending:
                # Nothing known yet: register the seed account.
                self.nameToId[self.start_name] = self.current_num
                self.unique[self.current_num] = set()
                self.csv_information.writerow(
                    [self.current_num, self.start_name, self.start_titile])
                pending.append(self.start_name)
            return pending

        def _open_data(self):
            """Load recorded edges, open the edge writer, return the resume id or None."""
            path = self.DATA_FILE
            has_content = os.path.exists(path) and os.path.getsize(path) > 0
            resume_id = None
            if has_content:
                with open(path, 'r', newline='', encoding='utf-8') as f:
                    rows = csv.reader(f, dialect='excel')
                    next(rows, None)  # skip the header row
                    for row in rows:
                        if len(row) < 2:
                            continue
                        source, target = int(row[0]), int(row[1])
                        self.unique.setdefault(source, set()).add(target)
                        self.unique.setdefault(target, set())
                        # Heuristic kept from the original: the smaller id of the
                        # last edge is taken as the account being crawled when
                        # the previous run stopped.
                        resume_id = min(source, target)
            self._data_file = open(path, 'a', newline='', encoding='utf-8')
            self.csv_data = csv.writer(self._data_file, dialect='excel')
            if not has_content:
                self.csv_data.writerow(['Source', 'Target'])
            return resume_id

        def _enqueue_pending(self, pending, resume_id):
            """Mark accounts before the resume point as visited and queue the rest."""
            resume_at = 0
            if resume_id is not None:
                for position, name in enumerate(pending):
                    if self.nameToId[name] == resume_id:
                        resume_at = position
                        break
            # If the resume id is unknown nothing is skipped; re-crawling is safe
            # because already recorded edges are de-duplicated.
            self.visName.extend(pending[:resume_at])
            # The resume account itself is queued again: its crawl was
            # interrupted, so its remaining pages still have to be fetched.
            for name in pending[resume_at:]:
                self.que.put(name)

        def _register(self, account, title):
            """Return the id of *account*, assigning and persisting a new one if unseen."""
            known = self.nameToId.get(account)
            if known is not None:
                return known
            self.current_num += 1
            self.nameToId[account] = self.current_num
            self.que.put(account)
            self.csv_information.writerow([self.current_num, account, title])
            self.unique.setdefault(self.current_num, set())
            return self.current_num

        def _record_edge(self, source, target):
            """Write the edge ``source -> target`` unless it was written before."""
            targets = self.unique.setdefault(source, set())
            if target not in targets:
                targets.add(target)
                self.csv_data.writerow([source, target])

        @staticmethod
        def _log_error(error):
            """Print *error* prefixed with the current local time."""
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), error)

        def DealHtml(self, html, Flag, name):
            """Extract accounts from one result page and record their relation to *name*.

            Args:
                html: HTML fragment containing ``<div class="userName">`` entries.
                Flag: 1 if the entries are accounts *name* follows, 2 if they are
                    fans of *name*; any other value only registers the accounts.
                name: account name whose list is being processed.
            """
            soup = BeautifulSoup(html, 'html.parser')
            for entry in soup.find_all('div', attrs={"class": "userName"}):
                # Best effort per entry: one malformed entry must not abort the page.
                try:
                    anchor = entry.find('a')
                    account = anchor.get('href')[1:]  # drop the leading '/'
                    other_id = self._register(account, anchor.string)
                    if Flag == 1:
                        self._record_edge(self.nameToId[name], other_id)
                    elif Flag == 2:
                        self._record_edge(other_id, self.nameToId[name])
                except Exception as e:
                    self._log_error(e)

        def _crawl_relations(self, name, relation_type):
            """Fetch up to MAX_PAGES pages of one relation list of *name*."""
            for page in range(1, self.MAX_PAGES + 1):
                # Best effort per page: a failed request is logged and the next
                # page is tried, so a long crawl survives transient errors.
                try:
                    stamp = str(int(time.time() * 1000))
                    url = self.RELATION_URL.format(name, relation_type, page, stamp)
                    ret = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
                    payload = json.loads(ret.text)
                    if not isinstance(payload, dict) or "info" not in payload:
                        break  # no further pages
                    self.DealHtml(payload['info'], relation_type, name)
                except Exception as e:
                    self._log_error(e)

        def getFans(self, name):
            """Crawl the fans (followers) of *name*."""
            self._crawl_relations(name, 2)

        def getIdol(self, name):
            """Crawl the accounts that *name* follows."""
            self._crawl_relations(name, 1)

        def _flush(self):
            """Flush pending CSV rows; accounts first so edges never reference unknown ids."""
            for handle in (getattr(self, '_information_file', None),
                           getattr(self, '_data_file', None)):
                if handle is not None and not handle.closed:
                    handle.flush()

        def close(self):
            """Close both CSV files; safe to call more than once."""
            for handle in (getattr(self, '_information_file', None),
                           getattr(self, '_data_file', None)):
                if handle is not None and not handle.closed:
                    handle.close()

        def start(self):
            """Crawl queued accounts breadth-first until the queue is empty."""
            try:
                while not self.que.empty():
                    visiter = self.que.get()
                    if visiter in self.visName:
                        continue
                    self.visName.append(visiter)
                    self.getIdol(visiter)
                    self.getFans(visiter)
                    # Persist after each account so an interrupted run can resume.
                    self._flush()
            finally:
                self._flush()
    
    
    class TencentWeiboArticles:
        """Unfinished article crawler; it only sets up empty state so far."""

        def __init__(self):
            """Create an empty id -> information mapping and an empty work queue."""
            self.IdToInformation = dict()
            self.que = Queue()

        def start(self):
            """Do nothing; the crawl logic has not been written yet."""
            return None
    # Script entry: crawl outward from the seed account 'xie_na'.
    weibo = TencentWeibo(start_name='xie_na', start_title='谢娜')
    weibo.start()
    

      

  • 相关阅读:
    Lucence.Net 2.9.3 日期范围搜索
    Frida 使用
    ubuntu+php5fpm 下安装 memcached PHP扩展
    cmd下使用telnet连接到memcached服务器操作
    解决sendmail卡死和主机名为bogon的问题
    【转载】Win7文件关联 文件与程序“联姻”
    [转载]Ubuntu下Samba服务器的最简配置
    Windows XP快速关机
    [转载]Git安装以及使用Git 管理个人文档
    GitHub push时提示“fatal: The remote end hung up unexpectedly”
  • 原文地址:https://www.cnblogs.com/xingkongyihao/p/9342259.html
Copyright © 2011-2022 走看看