zoukankan      html  css  js  c++  java
  • 爬虫大杂烩

    爬虫大杂烩

    """
    # 1 Scrape job listings from Lagou
    import requests

    # Request headers passed to the positionAjax.json POST in this script.
    # NOTE(review): the Cookie value embeds a captured logged-in session
    # (login=true, JSESSIONID, gate_login_token). Presumably it expires and has
    # to be replaced with a cookie from your own browser session -- TODO confirm.
    # It should also not be published as-is.
    headers = {
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Host': 'www.lagou.com',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        # Referer is the search-result page for "python" jobs (city parameter is URL-encoded).
        'Referer': "https://www.lagou.com/jobs/list_python?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput=",
        'Cookie': "index_location_city=%E4%B8%8A%E6%B5%B7; user_trace_token=20200303202747-787f5b5e-8819-4d60-a8c0-3920aaf97b87; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22170a05dd252be-062a9d067fa6cc-366b420b-1049088-170a05dd25333f%22%2C%22%24device_id%22%3A%22170a05dd252be-062a9d067fa6cc-366b420b-1049088-170a05dd25333f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2279.0.3945.130%22%7D%7D; _ga=GA1.2.442852312.1586218701; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586218701; LGUID=20200407081821-ea0bc584-cc75-43f0-9aa2-3c6fbe25bd10; JSESSIONID=ABAAAECAAFDAAEHA77B0A7162DFBDB833136F9E1BB7A309; WEBTJ-ID=20200407081848-1715200fade24c-0cb3e5dd9dd159-366b420b-1049088-1715200fadf446; _putrc=75D0A37619AD39A0123F89F2B170EADC; login=true; unick=%E5%8D%A0%E4%BA%9A%E5%B3%B0; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; privacyPolicyPopup=false; index_location_city=%E4%B8%8A%E6%B5%B7; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=ed5749058ca1359c4174036851b1e35881c33e2f3e; gate_login_token=7c10fb5f4a047e902fb2a37fe1f50c11a9127b60c1b4a449e8fbaf21a885afc7; _gid=GA1.2.1134112076.1586353092; _gat=1; PRE_UTM=; PRE_HOST=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fshanghai; PRE_SITE=https%3A%2F%2Fwww.lagou.com; LGSID=20200408213812-42f9d697-f383-473d-bf8d-0b78af930d27; hasDeliver=24; LGRID=20200408213814-dd89ae8a-ec86-4878-a2fb-863cec451b35; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1586353093",
        'Accept': "application/json, text/javascript, */*; q=0.01",
        'X-Anit-Forge-Code': "0",
        'X-Anit-Forge-Token': None,
        # Marks the request as an XMLHttpRequest (AJAX-style) call.
        'X-Requested-With': 'XMLHttpRequest'

    }
    # Form fields posted to the search endpoint; presumably 'pn' is the page
    # number and 'kd' the search keyword -- TODO confirm against the site.
    payload = {
        'first': 'false',
        'pn': 1,
        'kd': 'python',
    }
    search_url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false'

    # POST the form together with the prepared headers and dump the raw response body.
    response = requests.post(search_url, headers=headers, data=payload)
    print(response.text)
    
    
    # 2 Scrape the cnblogs post listing
    import requests
    from bs4 import BeautifulSoup

    # Download page 3 of the site-home listing and parse it with the lxml parser.
    page = requests.get('https://www.cnblogs.com/sitehome/p/3')
    soup = BeautifulSoup(page.text, 'lxml')

    # Every element with class "post_item" is treated as one post entry.
    for post in soup.find_all(class_='post_item'):
        link = post.find(class_='titlelnk')            # carries both title text and href
        summary = post.find(class_='post_item_summary')
        writer = post.find(class_='lightblue')
        # Printed fields, in order: title, URL, summary, author.
        print('''
        文章标题:%s
        文章地址:%s
        文章摘要:%s
        文章作者:%s
        '''%(link.text, link['href'], summary.text, writer.text))
    
    # Scrape the novel "Dream of the Red Chamber" (Hongloumeng), chapter by chapter
    import requests
    from bs4 import BeautifulSoup

    ret = requests.get('http://www.shicimingju.com/book/hongloumeng.html')
    soup = BeautifulSoup(ret.text, 'lxml')

    # The element with class "book-mulu" is searched for <li> items; each one
    # is expected to contain an <a> whose text is the chapter title and whose
    # href points at the chapter page.
    li_list = soup.find(class_='book-mulu').find_all(name='li')
    with open("红楼.txt", 'w', encoding='utf-8') as f:
        for li in li_list:
            # Look the anchor up once and read both title and href from it.
            link = li.find(name='a')
            title = link.text
            url = link['href']
            # Newline escape restored: the string literal had been split across
            # two physical lines, which is a syntax error.
            f.write(title + '\n')
            # The host is prepended to the href, so hrefs are presumably site-relative.
            ret_detail = requests.get('http://www.shicimingju.com' + url)
            soup2 = BeautifulSoup(ret_detail.text, 'lxml')
            content = soup2.find(class_='chapter_content').text
            f.write(content + '\n')
            print(title, "写入")
    
    
    # WeChat bot: chart the gender split of the friend list
    from wxpy import *
    from pyecharts import Pie
    import webbrowser
    bot = Bot(cache_path=True)  # note: the login has to be confirmed on the phone

    # All friend objects, collected in a list.
    friends = bot.friends()
    attr = ['男朋友', '女朋友', '未知性别']
    value = [0, 0, 0]
    for friend in friends:
        # sex == 1 means male, sex == 2 means female; anything else counts as unknown.
        slot = 0 if friend.sex == 1 else (1 if friend.sex == 2 else 2)
        value[slot] += 1

    pie = Pie("朋友男女比例")
    # Arguments: series name (str), category labels (list), matching values (list);
    # is_label_show controls whether the labels are displayed.
    pie.add("", attr, value, is_label_show=True)
    report = 'sex.html'
    pie.render(report)       # write the chart out as an HTML page
    webbrowser.open(report)  # open the page in the browser
    from wxpy import *
    bot = Bot(cache_path=True)


    @bot.register()
    def recv_send_msg(recv_msg):
        """Print the text of a received message and return a fixed acknowledgement.

        The returned string is presumably what wxpy sends back as the reply --
        confirm against the wxpy ``Bot.register`` documentation.
        """
        incoming_text = recv_msg.text  # .text gives the message's text content
        print('收到的消息:', incoming_text)
        return '好的'


    # Enter the interactive Python shell so the program keeps running.
    embed()
    
    
    
    # Scrape text posts from Qiushibaike
    import requests
    from bs4 import BeautifulSoup

    # Fetch page 2 of the text section and parse it with the lxml parser.
    page = requests.get('https://www.qiushibaike.com/text/page/2/')
    soup = BeautifulSoup(page.text, 'lxml')

    # For every element with class "article", print the text of its first
    # "content" descendant, followed by a separator line.
    for entry in soup.find_all(class_='article'):
        body = entry.find(class_='content')
        print(body.text)
        print('-------')
    
    
    
    # Query KFC store locations
    import requests

    store_list_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # Only a browser-like User-Agent is sent with this request.
    browser_headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    }

    # Form fields for the keyword search; presumably pageIndex/pageSize page
    # through the results -- TODO confirm against the endpoint.
    query = dict(cname='', pid=20, keyword='浦东', pageIndex=1, pageSize=10)

    # POST the form and dump the raw response body.
    reply = requests.post(store_list_url, data=query, headers=browser_headers)
    print(reply.text)
    
    """
    
  • 相关阅读:
    in_array()和explode()的使用笔记
    写sql语句连接的时候注意的一个小细节
    在thinkphp框架模板中引用session
    查询数据库所有(某个)表中字段名,数据类型,说明等,导出数据字典
    (委托事件处理)关于多线程执行显示进度条的实例(转)&&线程间操作无效: 从不是创建控件“rtxtEntryNO”的线程访问它。
    判断当前线程所处的状态 (转)以及终止当前线程
    string 字符串的分隔处理与list的相互转换
    C# 动态调用webservice
    C#中的List<string>泛型类示例
    命名空间"system.web"中不存在类型或命名空间名称security"
  • 原文地址:https://www.cnblogs.com/yafeng666/p/12663421.html
Copyright © 2011-2022 走看看