  • Week 1 Study Summary

    This week I learned some basic Python syntax and functions, as well as how to crawl different kinds of websites.

    I scraped the epidemic data from DXY (丁香园), plus some listings from Lagou and Maoyan.

    I picked up two extraction methods: XPath and regular expressions.

    XPath

    import requests
    from lxml import etree
    url='http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=AH20021900306'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    
    
    response = requests.post(url, headers=headers)

    # parse the downloaded page into an element tree so it can be queried with XPath
    res_html = etree.HTML(response.text)
    dd_list = res_html.xpath('//*[@id="f_baner"]/div[1]/div[1]/div[2]/strong/text()')[0].strip()
    print(dd_list)

    This extracts the selected text; .strip() trims the surrounding whitespace.
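
    To see those two steps in isolation, here is a minimal standalone sketch; the HTML fragment is invented for illustration, only etree.HTML and .xpath come from the script above:

    from lxml import etree

    # an invented fragment standing in for response.text
    doc = etree.HTML('<div><strong>  hello  </strong></div>')
    texts = doc.xpath('//strong/text()')  # -> ['  hello  ']
    print(texts[0].strip())               # -> 'hello'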

    When the data you want sits right in the page source, regular expressions work as well:

    import requests
    import re
    import xlwt
    
    url = 'https://maoyan.com/board/4?'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    
    
    def get_page(url):
        try:
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response.text
            else:
                print('failed to fetch the page')
        except Exception as e:
            print(e)
    
    
    def get_info(page):
        # each (.*?) is a capture group for a field we want to pull out of the page
        items = re.findall('.*?class="name"><a href=".*?" title="(.*?)".*?">.*?</a></p>.*?<p class="releasetime">(.*?)</p>', page, re.S)
        for item in items:
            data = {}
            data['name'] = item[0]
            data['time'] = item[1]
            print(data)
            yield data
    
    
    urls = ['https://maoyan.com/board/4?offset={}'.format(i * 10) for i in range(10)]
    DATA = []
    for url in urls:
        print(url)
        page = get_page(url)
        datas = get_info(page)
        for data in datas:
            DATA.append(data)  # collect every row into DATA
    
    f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, 'name')  # header row, column 0
    sheet01.write(0, 1, 'time')  # header row, column 1
    # write the data rows
    for i in range(len(DATA)):
        sheet01.write(i + 1, 0, DATA[i]['name'])
        sheet01.write(i + 1, 1, DATA[i]['time'])
    
        print('p', end='')  # crude progress marker
    f.save(r'F:\作业.xls')

    A quick sketch of why the pattern above pairs non-greedy (.*?) groups with re.S: re.S lets . match newlines, and non-greedy groups grab the shortest possible text. The HTML fragment below is invented to mimic the board page's markup, not copied from Maoyan:

    import re

    html = '''
    <p class="name"><a href="/films/1" title="霸王别姬">霸王别姬</a></p>
    <p class="releasetime">上映时间:1993-01-01</p>
    '''
    # without re.S, the .*? between the two <p> tags could not cross the newline
    pattern = 'class="name"><a href=".*?" title="(.*?)".*?</a></p>.*?<p class="releasetime">(.*?)</p>'
    print(re.findall(pattern, html, re.S))  # -> [('霸王别姬', '上映时间:1993-01-01')]

    Finally, there is the case where the data lives in JSON and is fetched via Ajax:

    import requests
    import time
    import json
    import xlwt
    urls = 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='  # listing page, visited first only to pick up cookies
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'  # Ajax endpoint that returns the job JSON
    
    headers = {
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    
    }
    
    
    # The form-data parameters live under Network/XHR: find the request that returns the data,
    # then look at the bottom of its Headers tab. The URL is the same for every page; the form
    # data controls the search keyword and the page number.
    def get_page(url, i):
        data = {
            'first': 'false',
            'kd': '爬虫',  # search keyword; when changing it, update the Referer in headers too
            'pn': i  # page number
        }
        try:
            s = requests.Session()
            s.get(urls, headers=headers, timeout=3)  # hit the listing page first to obtain cookies
            cookie = s.cookies  # reuse the cookies from that request
            response = s.post(url, data=data, headers=headers, cookies=cookie, timeout=3)  # post the form to the Ajax endpoint
            time.sleep(5)
    
            response.raise_for_status()
            response.encoding = response.apparent_encoding
    
            return response.json()
        except requests.ConnectionError as e:
            print('Error:', e.args)
    
    
    def get_info(payload):  # renamed from json to avoid shadowing the json module
        # drill into the JSON payload to reach the list of job postings
        result = payload['content']['positionResult']['result']

        for item in result:
            data = {}
            data['companyFullName'] = item['companyFullName']
            data['city'] = item['city']
            data['district'] = item['district']
            data['education'] = item['education']
            data['financeStage'] = item['financeStage']
            data['industryField'] = item['industryField']
            data['companySize'] = item['companySize']
            data['positionName'] = item['positionName']
            data['salary'] = item['salary']
            data['workYear'] = item['workYear']
    
            yield data
    
    
    DATA = []
    for i in range(1, 3):
        page_json = get_page(url, i)

        print('requesting page ' + str(i) + ': ' + url)
        time.sleep(2)
        datas = get_info(page_json)  # if this step fails, the request above probably failed first
        for data in datas:
            DATA.append(data)
    '''
    with open('./拉勾网.text','w',encoding='utf-8') as fp:
        fp.write(str(DATA))'''
    
    '''f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, 'companyFullName')  # header row, column 0
    sheet01.write(0, 1, 'city')
    sheet01.write(0, 2, 'district')
    sheet01.write(0, 3, 'education')
    sheet01.write(0, 4, 'financeStage')
    sheet01.write(0, 5, 'positionName')
    sheet01.write(0, 6, 'salary')
    sheet01.write(0, 7, 'workYear')
    sheet01.write(0, 8, 'companySize')
    sheet01.write(0, 9, 'industryField')
    
    # write the data rows
    for i in range(len(DATA)):
        sheet01.write(i + 1, 0, DATA[i]['companyFullName'])
        sheet01.write(i + 1, 1, DATA[i]['city'])
        sheet01.write(i + 1, 2, DATA[i]['district'])
        sheet01.write(i + 1, 3, DATA[i]['education'])
        sheet01.write(i + 1, 4, DATA[i]['financeStage'])
        sheet01.write(i + 1, 5, DATA[i]['positionName'])
        sheet01.write(i + 1, 6, DATA[i]['salary'])
        sheet01.write(i + 1, 7, DATA[i]['workYear'])
        sheet01.write(i + 1, 8, DATA[i]['companySize'])
        sheet01.write(i + 1, 9, DATA[i]['industryField'])
        print('p', end='')
    f.save(r'F:\爬虫.xls')
    '''
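
    Since the xlwt block above is commented out, here is a hedged alternative that writes the same DATA list with the standard csv module instead; the field names come from get_info, while the output filename is arbitrary:

    import csv

    fields = ['companyFullName', 'city', 'district', 'education', 'financeStage',
              'industryField', 'companySize', 'positionName', 'salary', 'workYear']
    # newline='' avoids blank rows on Windows; utf-8-sig keeps Excel happy with Chinese text
    with open('lagou.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.DictWriter(fp, fieldnames=fields)
        writer.writeheader()
        writer.writerows(DATA)  # DATA is the list of dicts built above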