zoukankan      html  css  js  c++  java
  • 爬虫学习笔记整理一

    tips

    • 不论爬取哪个网页,都可以加上请求头信息

    requests使用代理

    import requests
    
    # httpbin echoes back the IP address the request appears to come from,
    # which makes it easy to check whether the proxy is really being used.
    url = "http://httpbin.org/ip"
    # Maps URL scheme -> proxy address; only plain-http requests are proxied here.
    proxies = {'http':'119.39.68.252:8118'}
    # Always pass a timeout: requests waits indefinitely by default, and public
    # proxies like this one are frequently dead.
    resp = requests.get(url, proxies=proxies, timeout=10)

    print(resp.text)
    

    lxml底层是用C语言实现的,所以在IDE中一般没有代码自动补全提示

    1. 解析html字符串,使用etree.HTML(htmlstr)进行解析
    2. 解析html文件,使用etree.parse(filepath,parser=etree.HTMLParser(encoding="utf-8"))

    • print(resp.text)#返回的是经过解码后的字符串,是str(unicode)类型;编码是requests自动猜测的,猜错时会出现乱码,这时就需要自己指定解码方式
    • print(resp.content)#返回的是原始的字节数据,即从网页上抓取的、没有经过解码处理的响应内容,是bytes类型
    resp = requests.get(url, headers=HEADERS)
    # Decode the raw bytes manually as GBK. errors="ignore" drops any bytes that
    # are not valid GBK instead of raising UnicodeDecodeError.
    text = resp.content.decode("gbk", errors="ignore")
    

    xpath语法

    # Relative query from `table`: keep only <tr> elements whose position is
    # greater than 2 (XPath positions are 1-based, so the first two are skipped).
    trs = table.xpath(".//tr[position()>2]")

    # Crawl list pages 1 through 4 (range's upper bound is exclusive).
    for i in range(1,5):
        url = "https://www.gushiwen.org/default_%s.aspx" % i
        parse_page(url)

    # Select <img> elements nested in the matching <a> tags whose class
    # attribute is something other than 'gif'.
    imgs = html.xpath("//a[@class='col-xs-6 col-sm-3']//img[@class!='gif']")
    for img in imgs:
        # Element.get() returns the value of the named attribute.
        img_url = img.get('data-original')
    

    json字符串操作

    在python中,只有基本数据类型才可以直接转换为json格式的字符串,即
    int、float、str、bool、None、list、dict、tuple(自定义类的对象需要先转换成这些类型)

    import json
    
    # 1. Serialize Python objects to a JSON string.
    persons = [
        {'name': '张三', 'age': 18, 'gender': '男'},
        {'name': '张四', 'age': 17, 'gender': '女'},
        {'name': '张五', 'age': 19, 'gender': '男'},
    ]

    # With the default ensure_ascii=True, non-ASCII characters are emitted as
    # \uXXXX escapes.
    json_str = json.dumps(persons)
    print(type(json_str), json_str)

    # 2. Store JSON data in a file.
    with open("person.json", "w", encoding="utf-8") as f:
        # Option 1 would be to write the already-serialized string: f.write(json_str)
        # Option 2: dump the Python object straight into the file, skipping the
        # intermediate string. ensure_ascii=False keeps the Chinese text readable.
        json.dump(persons, f, ensure_ascii=False)

    # 3. Load a JSON string back into Python objects.
    # Raw string literal so the \uXXXX escapes reach the JSON parser untouched;
    # this is exactly what json.dumps(persons) produced above.
    jsonstr = r'[{"name": "\u5f20\u4e09", "age": 18, "gender": "\u7537"}, {"name": "\u5f20\u56db", "age": 17, "gender": "\u5973"}, {"name": "\u5f20\u4e94", "age": 19, "gender": "\u7537"}]'
    persons = json.loads(jsonstr)
    for p in persons:
        print(p)

    print("-" * 50)

    # 4. Load JSON directly from the file written in step 2.
    with open("person.json", "r", encoding="utf-8") as f:
        persons = json.load(f)
        print(type(persons))
        for p in persons:
            print(p)
    

    存储为csv文件

    import csv
    
    # =============================================================================
    # 读取csv文件
    # =============================================================================
    def read_csv_by_index(path="5-冀东水泥集团问题汇总【36家】.csv"):
        """Read a CSV file by column index and print every complete record.

        The first row is treated as a header and skipped. For each remaining
        row, column 1, column 2 and the last column are read as ``express``,
        ``ytype`` and ``company``; the record is printed as a dict only when
        all three values are non-empty.

        Args:
            path: CSV file to read. Defaults to the file name this script
                was originally written for.
        """
        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(path, "r", encoding="utf-8", newline="") as f:
            # csv.reader is an iterator yielding one list of strings per row.
            reader = csv.reader(f)
            # Skip the header row; the default avoids StopIteration on an empty file.
            next(reader, None)
            for row in reader:
                # Blank lines come back as [] and short rows lack the indexed
                # columns, so skip them instead of raising IndexError.
                if len(row) < 3:
                    continue
                express, ytype, company = row[1], row[2], row[-1]
                if express and ytype and company:
                    hazard = {'express': express, 'ytype': ytype, 'company': company}
                    print(hazard)
    
    def read_csv_by_dict(path="5-冀东水泥集团问题汇总【36家】.csv"):
        """Read a CSV file with ``csv.DictReader`` and print every row.

        Args:
            path: CSV file to read. Defaults to the file name this script
                was originally written for.
        """
        # newline="" lets the csv module handle line endings itself.
        with open(path, "r", encoding="utf-8", newline="") as f:
            # DictReader consumes the first row as the field names, so the header
            # is not yielded; every item it produces is a mapping keyed by column name.
            reader = csv.DictReader(f)
            for line in reader:
                print(line)
        
    # =============================================================================
    # 写入数据到csv文件
    # =============================================================================
    def write_list_to_csv():
        """Write a header row followed by four sample person tuples to person_list.csv."""
        columns = ['username', 'age', 'sex']
        people = [
            ('张三', 12, '男'),
            ('李三', 19, '男'),
            ('张五', 28, '男'),
            ('王小二', 18, '女'),
        ]

        # An explicit utf-8 encoding avoids garbled text; newline="" stops the
        # line endings written by the csv module from being translated again,
        # which would otherwise show up as extra blank rows.
        with open("person_list.csv", "w", encoding="utf-8", newline="") as out:
            # For a plain writer the header is just the first row, so the header
            # and the data can go out in a single call.
            csv.writer(out).writerows([columns, *people])
    
    def write_dict_to_csv():
        """Write four sample person dicts, preceded by a header row, to person_dict.csv."""
        fieldnames = ['username', 'age', 'sex']
        rows = [
            {'username': '张三', 'age': 18, 'sex': '男'},
            {'username': '李三', 'age': 16, 'sex': '女'},
            {'username': '张五', 'age': 18, 'sex': '女'},
            {'username': '王小二', 'age': 19, 'sex': '男'},
        ]

        with open("person_dict.csv", "w", encoding="utf-8", newline="") as out:
            dict_writer = csv.DictWriter(out, fieldnames=fieldnames)
            # A DictWriter only emits the header when asked to explicitly.
            dict_writer.writeheader()
            for row in rows:
                dict_writer.writerow(row)
    
    
    # Script entry point: when run directly, only the dict-based CSV writer is executed.
    if __name__ == "__main__":
        write_dict_to_csv()
    

    正则模块

    # Strip question marks, full stops and exclamation marks (ASCII and
    # full-width/CJK forms) from the title so it can be used in a file path.
    # NOTE(review): the scraped page showed '?' and '!' twice in this class, which
    # looks like the full-width variants having been flattened; restored here.
    title = re.sub(r'[?？.。!！]', '', title)
    # splitext returns (root, ext); keep the extension, e.g. ".jpg".
    suffix = os.path.splitext(img_url)[1]

    # Download the image and write the raw bytes to disk.
    # `filename` is built outside this excerpt; presumably title + suffix (confirm).
    data = requests.get(img_url).content
    with open("images/"+filename, "wb") as f:
        f.write(data)
    

    selenium+chromedriver

    chromedriver下载地址:
    http://chromedriver.storage.googleapis.com/index.html
    根据谷歌浏览器版本下载对应的驱动chromedriver

    import time
    from selenium import webdriver
    
    def automation(url, driver_path=r"F:\python\chromedriver_win32\chromedriver.exe"):
        """Open *url* in Chrome, print the page source and type into the search box.

        Args:
            url: Address of the page to load.
            driver_path: Location of the chromedriver executable. It is a raw
                string so the Windows backslashes are never read as escapes.
        """
        # NOTE(review): executable_path= and find_element_by_id() are Selenium 3
        # style APIs; confirm they still exist in the installed Selenium version.
        driver = webdriver.Chrome(executable_path=driver_path)
        try:
            # From here on the driver object controls the Chrome window.
            driver.get(url)

            # page_source is an attribute holding the HTML of the current page.
            print(driver.page_source)

            # Find the input box by its id and type a query into it. This must
            # happen while the browser session is still alive, i.e. before quit().
            input_tag = driver.find_element_by_id("kw")
            input_tag.send_keys("python")

            # Keep the window open briefly so the result can be observed.
            time.sleep(5)
        finally:
            # quit() shuts down the whole browser (close() would only close the
            # current page); doing it in finally avoids leaking the browser
            # process when any step above raises.
            driver.quit()
    
    
    # Script entry point: run the browser automation against the Baidu home page.
    if __name__ == "__main__":
        url = "http://www.baidu.com"
        automation(url)
    
  • 相关阅读:
    markdown转HTML,目录生成
    schedule与scheduleAtFixedRate之Timer源码分析
    rocketmq刷盘过程
    rocketmq消息存储概述
    Cassandra修改集群名称
    Cassandra读写性能测试
    rocketmq--push消费过程
    rocketmq消费负载均衡--push消费为例
    go反射实例
    JUnit4参数的使用
  • 原文地址:https://www.cnblogs.com/zxfei/p/12075262.html
Copyright © 2011-2022 走看看