zoukankan      html  css  js  c++  java
  • 百度贴吧爬虫小程序源码

    # -*- coding: utf-8 -*-
    import urllib.request
    import urllib.parse
    import os
    # Base endpoint of the Tieba thread listing; the query string is appended per page.
    BASE_URL = 'http://tieba.baidu.com/f?'

    # Offset step between consecutive listing pages: page N starts at (N - 1) * 50.
    PAGE_SIZE = 50

    # Directory under which one sub-directory per forum name is created.
    # NOTE(review): 'E:python' is a drive-relative Windows path (relative to the
    # current directory on drive E:); confirm whether r'E:\python' was intended.
    SAVE_ROOT = r'E:python'

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }


    def build_page_url(keyword, page, base_url=BASE_URL):
        """Return the listing URL for 1-based ``page`` of the forum ``keyword``.

        ``keyword`` is passed unquoted: ``urlencode`` percent-encodes it exactly
        once (quoting it beforehand would double-encode ``%`` as ``%25``).
        The ``pn`` parameter is the item offset, i.e. ``(page - 1) * PAGE_SIZE``.
        """
        query = urllib.parse.urlencode({
            'kw': keyword,
            'ie': 'utf-8',
            'pn': (page - 1) * PAGE_SIZE,
        })
        # Build a fresh URL each time instead of appending to a shared string,
        # so query strings do not pile up across pages.
        return base_url + query


    def download_pages(keyword, start, end, save_root=SAVE_ROOT):
        """Download pages ``start``..``end`` (inclusive) of forum ``keyword``.

        Each page is saved as ``<save_root>/<keyword>/<keyword>_<page>.html``.
        The target directory is created if it does not exist yet.
        """
        target_dir = os.path.join(save_root, keyword)
        # exist_ok avoids both the crash on re-runs and the check/create mismatch.
        os.makedirs(target_dir, exist_ok=True)

        for page in range(start, end + 1):
            request = urllib.request.Request(
                url=build_page_url(keyword, page),
                headers=HEADERS,
            )
            # Close the HTTP response deterministically once the body is read.
            with urllib.request.urlopen(request) as response:
                content = response.read()

            filepath = os.path.join(target_dir, keyword + '_' + str(page) + ".html")
            with open(filepath, 'wb') as fp:
                fp.write(content)
            print("下载完成第{n}页".format(n=page))


    def main():
        """Prompt for the page range and forum name, then download the pages."""
        start = int(input("请输入开始页码:"))
        end = int(input("请输入结束页码:"))
        name = input("请输入搜索贴吧的名字:")
        download_pages(name, start, end)


    if __name__ == "__main__":
        main()
  • 相关阅读:
    python
    car-travel project
    数据库
    kafka笔记
    cloudera笔记
    上课笔记
    structured streaming
    SparkSQL
    流数据
    spark厦门大学
  • 原文地址:https://www.cnblogs.com/ybl20000418/p/11609596.html
Copyright © 2011-2022 走看看