zoukankan      html  css  js  c++  java
  • python 简书用户爬虫

    python 简书用户爬虫 广度优先

     1 # python
     2 # -*- coding: utf-8 -*-
     3 """
     4 __title__ = ''
     5 __author__ = 'wlc'
     6 __mtime__ = '2017/10/15'
     7 """
     8 import re
     9 import time
    10 import math
    11 import csv
    12 import requests
    13 from bs4 import BeautifulSoup
    14 from collections import deque
    15 import sys
    16 #python 默认递归限制为900
    17 sys.setrecursionlimit(10000)
    18 
    19 #建立一个csv文件保存信息
    20 path = 'dataCollection/userInfo.csv'
    21 csvFile = open(path, 'a+', newline='', encoding='utf-8')
    22 writer = csv.writer(csvFile)
    23 writer.writerow(('id','name','following','follower','article','word','like'))
    24 
    25 #全局变量用来存储userid 和关注的人数
    26 idContainer = set()
    27 #用来放置用户的链接使用双向队列
    28 linkDeque  = deque()
    29 
    30 class jianshu(object):
    31     def __init__(self):
    32         #定制url模板
    33         self.url = 'http://www.jianshu.com/users/{userId}/following?page={page}'
    34         #用户id与name的匹配规则
    35         self.idPattern = re.compile('<a class="name" href="/u/(.*?)">(.*?)</a>')
    36         #用户的关注 粉丝 文章 文集 的匹配规则
    37         self.metalPattern = re.compile('<span>关注 (d+)</span><span>粉丝 (d+)</span><span>文章 (d+)</span>')
    38         self.meta = re.compile('写了 (d+) 字,获得了 (d+) 个喜欢')
    39         #伪装成浏览器
    40         self.header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    41 
    42     def createRequest(self, userId, page):
    43         url = self.url.format(userId = userId, page = page)
    44         requ = requests.get(url, headers = self.header).text
    45         return requ
    46 
    47     def pageResponse(self, requ):
    48         bsOBJ = BeautifulSoup(requ, 'lxml')
    49         userContainer = bsOBJ.find_all('ul',{'class':'user-list'})[0]
    50         userContent = userContainer.contents
    51         userContent = [str(user) for user in userContent if user != '
    ']
    52         #关注用户列表
    53         return userContent
    54 
    55     def parserUserInfo(self, user):
    56         id, name = re.findall(self.idPattern, user)[0]
    57         followingNum, followerNum, articleNum = re.findall(self.metalPattern, user)[0]
    58         try:
    59             wordNum, likeNum = re.findall(self.meta, user)[0]
    60         except:
    61             wordNum, likeNum = 0, 0
    62         content = (id, name, followingNum, followerNum, articleNum, wordNum, likeNum)
    63         writer.writerow(content)
    64         return  content
    65 
    66     def getUserList(self, userId, following):
    67         idContainer.add((userId, following))
    68         num = int(following) / 10
    69         page = math.ceil(num)
    70         for pg in range(1, page + 1, 1):
    71             requ = self.createRequest(userId, pg)
    72             userList = self.pageResponse(requ)
    73             for user in userList:
    74                 content = self.parserUserInfo(user)
    75                 linkDeque.append((content[0], content[2]))
    76             time.sleep(1)
    77         for deq in linkDeque:
    78             if deq not in idContainer:
    79                 self.getUserList(deq[0],deq[1])
    80                 print("what")
    81 jianshu = jianshu().getUserList('1562c7f16a04',45)
  • 相关阅读:
    注释代码片段
    更新docker时间-需要重启docker
    mysql随机查询若干条数据的方法
    Linux 块设备驱动 (一)
    Linux SD/MMC/SDIO驱动分析
    【转】Alsa音频编程【精华】
    goahead webserver源码分析
    【转】Linux系统调用列表
    Arm Linux系统调用流程详细解析
    Socket 相关的知识
  • 原文地址:https://www.cnblogs.com/wlc297984368/p/7673777.html
Copyright © 2011-2022 走看看