  • Python web scraping and data collection

    #Python web scraping techniques
    #1-1 Build request/response communication with Python's urllib or the requests library
    #1 Import Python's URL/request modules for communicating with web pages
    '''
    from urllib.request import urlopen
    url="https://www.python.org/"
    response=urlopen(url)
    content=response.read()
    #the response body is raw bytes and needs decoding
    content=content.decode("utf-8")
    print(content)
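
    #If the site is slow or returns an error status, urlopen raises an exception;
    #a minimal sketch of the same request with a timeout and basic error handling:
    from urllib.request import urlopen
    from urllib.error import URLError,HTTPError
    try:
        response=urlopen("https://www.python.org/",timeout=10)
        content=response.read().decode("utf-8")
    except HTTPError as e:
        print("server returned an error:",e.code)  #HTTPError must be caught before URLError
    except URLError as e:
        print("failed to reach the server:",e.reason)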

    #2 Calling urlopen on a bare URL is blunt; sometimes we want to wrap the URL in a Request object first

    import urllib.request
    url="https://www.python.org/"
    request=urllib.request.Request(url)
    response=urllib.request.urlopen(request)
    content=response.read().decode("utf-8")
    print(response.geturl())
    print(response.info())
    #print the response status code
    print(response.getcode()) #200 means success
    print(type(response))
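
    #A Request object also accepts custom headers, the same trick used with the
    #requests library below; a minimal sketch:
    import urllib.request
    req=urllib.request.Request(
        "https://www.python.org/",
        headers={"User-Agent":"Mozilla/5.0"}  #any browser-like UA string works here
    )
    response=urllib.request.urlopen(req)
    print(response.getcode())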

    #3 The requests library: another way to set up request/response communication
    import requests
    res=requests.get("http://www.python.org/")
    print(res.status_code)#status code
    print(res.text) #the response body as decoded text
    print(res.content) #the raw bytes, not limited to text
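
    #requests can also take query parameters and a timeout directly, and can
    #raise on bad status codes; a minimal sketch:
    import requests
    res=requests.get("https://www.python.org/",params={"q":"demo"},timeout=10)
    res.raise_for_status()  #raises requests.HTTPError for 4xx/5xx responses
    print(res.url)  #the final URL, including the encoded query string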

    #Set a User-Agent header so the request looks like it comes from a browser rather than a Python program, in case the site rejects the default client
    import requests
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    res=requests.get("http://www.python.org/",headers=headers)
    print(res) #prints the Response object itself (e.g. <Response [200]>), not the page text
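
    #To confirm the header was actually sent, inspect the prepared request:
    print(res.request.headers["User-Agent"])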

    #1-2 Parsing the web page data
    #BeautifulSoup can be used to parse it
    import requests
    from bs4 import BeautifulSoup
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    url="http://news.qq.com/"
    soup=BeautifulSoup(requests.get(url=url,headers=headers).text,"lxml")
    em=soup.find_all("em",attrs={"class":"f14 l24"})
    for i in em:
        title=i.a.get_text()
        link=i.a["href"]
        print({"title":title,
               "link":link
               })
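
    #The "lxml" parser above requires the lxml package; if it is not installed,
    #the standard-library parser can be swapped in:
    soup=BeautifulSoup(requests.get(url=url,headers=headers).text,"html.parser")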

    #The lxml parsing library
    import requests
    from lxml import etree
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    url="http://news.qq.com/"
    html=requests.get(url=url,headers=headers)
    con=etree.HTML(html.text)
    title=con.xpath('//em[@class="f14 l24"]/a/text()')
    link=con.xpath('//em[@class="f14 l24"]/a/@href')
    for i in zip(title,link):
        print({"title":i[0],
               "link":i[1]
               })
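
    #If a page returns relative hrefs, they can be made absolute before use;
    #a minimal sketch with the standard library:
    from urllib.parse import urljoin
    absolute_links=[urljoin(url,l) for l in link]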

    #1-3 Ways to extract information
    #1 CSS selectors (the select method); XPath expressions follow as method 2, and a regular-expression sketch comes after that
    import requests
    from bs4 import BeautifulSoup
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    url="http://news.qq.com/"
    soup=BeautifulSoup(requests.get(url=url,headers=headers).text,"lxml")
    em=soup.select('em[class="f14 l24"] a')
    for i in em:
        title=i.get_text()
        link=i["href"]
        print({"title":title,
               "link":link
               })
    #2 XPath expressions
    import requests
    import lxml.html as HTML
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    url="http://news.qq.com/"
    con=HTML.fromstring(requests.get(url=url,headers=headers).text)
    title=con.xpath('//em[@class="f14 l24"]/a/text()')
    link=con.xpath('//em[@class="f14 l24"]/a/@href')
    for i in zip(title,link):
        print({"title":i[0],
               "link":i[1]
               })
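
    #3 Regular expressions
    #A minimal sketch; the tag structure and class name in the pattern are assumed
    #from the pages above, so the pattern must be adapted to the real markup
    import re
    import requests
    headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    url="http://news.qq.com/"
    html=requests.get(url=url,headers=headers).text
    pattern=re.compile(r'<em class="f14 l24"><a[^>]*href="(.*?)"[^>]*>(.*?)</a>',re.S)
    for link,title in pattern.findall(html):
        print({"title":title,
               "link":link
               })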
    '''
    #1-4 Job-posting data collection - scraping static pages
    import requests
    from lxml import etree
    import pandas as pd
    from time import sleep
    import random

    #cookie copied from a logged-in browser session; the site may reject requests without it
    cookie='_ga=JSESSIONID=ABAAABAABGGAAFDB41FBAEE3423BAB77758EF657C3B981D; WEBTJ-ID=2020%E5%B9%B43%E6%9C%8824%E6%97%A5113837-1710a9ee8274fd-07cfb0bfecbcec-4d015463-921600-1710a9ee8296c; user_trace_token=20200324113840-9c306f42-1785-49cc-852d-5782edc6b421; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGUID=20200324113840-4182d6ba-8d8f-48fb-97ec-b01c2051ab8b; _gat=1; sajssdk_2015_cross_new_user=1; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_navigation; SEARCH_ID=2b38b688f0674a5d99609b2e0e6dfaf6; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221710a9f0212417-0b54e87737aecb-4d015463-921600-1710a9f02140%22%2C%22%24device_id%22%3A%221710a9f0212417-0b54e87737aecb-4d015463-921600-1710a9f02140%22%7D; lagou_utm_source=A; X_HTTP_TOKEN=65013509d78b2a6e42312058514ff12787dc7a92a2; _gid=GA1.2.2122772491.1585021121; _ga=GA1.2.1048705963.1585021121; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021121; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1585021324; LGSID=20200324113840-d14fb07b-d1d3-4a29-958b-a2ccb9513c4e; LGRID=20200324114204-53389ffc-25c0-415a-8871-afa85e05ed33; Hm_lvt_9d483e9e48ba1faa0dfceaf6333de846=1585021121; Hm_lpvt_9d483e9e48ba1faa0dfceaf6333de846=1585021325'
    headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Cookie":cookie
    }
    #Inspect the page structure, then loop over the result pages to collect listings
    #accumulate the fields across pages so earlier pages are not overwritten
    job_name,job_address,job_company,job_links,job=[],[],[],[],[]
    for i in range(1,3):
        sleep(random.randint(3,10))
        url="https://www.lagou.com/zhaopin/jiqixuexi/{}/?filterOption=3&sid=8652d786c2764b7fa533a9e22e915a3c".format(i)
        print("Scraping page {}...".format(i),url)
        #request the page and parse it
        con=etree.HTML(requests.get(url=url,headers=headers).text)
        #extract each target field with XPath
        job_name+=con.xpath('//a[@class="position_link"]/h3/text()')
        job_address+=con.xpath('//a[@class="position_link"]/span/em/text()')
        job_company+=con.xpath('//div[@class="company_name"]/a/text()')
        links=con.xpath('//div[@class="p_top"]/a/@href')
        job_links+=links
        #follow each detail-page link and collect the job description
        for link in links:
            sleep(random.randint(3,10))
            con2=etree.HTML(requests.get(url=link,headers=headers).text)
            des=[p.xpath("string(.)") for p in con2.xpath('//dd[@class="job_bt"]/div/p')]
            job.append(des)

    #wrap the data in a dictionary
    datasets={
    "job title":job_name,
    "location":job_address,
    "company":job_company,
    "detail link":job_links,
    "requirements":job
    }
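
    #Optional sanity check: pd.DataFrame raises a ValueError when the lists in
    #the dictionary differ in length, which happens easily when a listing is
    #missing a field, so it is worth verifying the counts first
    assert len(job_name)==len(job_address)==len(job_company)==len(job_links)==len(job), \
        "field lists are out of sync; a page probably failed to parse"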

    #convert the data to a DataFrame and save it as a CSV file
    data=pd.DataFrame(datasets)
    data.to_csv("machine learning.csv")
    print(data.head())
