  • day_4: Text Storage (1)

    TXT

    from pyquery import PyQuery
    import re
    import requests
    
    
    def get_html(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            print('request failed:', url)
            return None
    
    def parse_time(text):
        # extract a date like 2018-11-30 (or just the year) from the release-time string
        match = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
        return match.group() if match else ''
    
    def parse_html(html, f):
    
        doc = PyQuery(html)
        dd_nodes = doc('dl.board-wrapper')
        ranks = dd_nodes('.board-index').items()
        names = dd_nodes('.name').items()
        actors = dd_nodes('.star').items()
        times = dd_nodes('.releasetime').items()
        integers = dd_nodes('.integer').items()
        fractions = dd_nodes('.fraction').items()
    
        for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
            line = '\n'.join([rank.text(), name.text(), actor.text().replace('主演:', ''),
                              parse_time(ts.text()), integer.text() + fraction.text()])
            f.write(line + '\n\n')  # one field per line, records separated by a blank line
    
    if __name__ == '__main__':
        url = 'http://maoyan.com/board/4'
    
        with open('movie.txt', 'w', encoding='utf-8') as f:
            for i in range(10):
                path = url + '?offset=' + str(i*10)
                print(path)
                html = get_html(path)
                if html:
                    parse_html(html, f)
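
    To sanity-check the output, here is a minimal read-back sketch (it assumes the blank-line-separated record format written above):

    # split movie.txt back into records on the blank lines written above
    with open('movie.txt', encoding='utf-8') as f:
        records = [block.splitlines() for block in f.read().split('\n\n') if block.strip()]
    print(len(records), 'records')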

    JSON

    json.loads(s) converts a JSON string into a Python object

    json.dumps(obj, indent=2, ensure_ascii=False) converts a Python object into a JSON string

    indent=2 controls the formatting; 2 is the number of spaces per indentation level

    ensure_ascii=False keeps non-ASCII characters (such as Chinese) readable instead of escaping them
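
    A quick sketch of both calls (the sample dict is made up for illustration):

    import json

    movie = {'name': '霸王别姬', 'rank': 1}               # hypothetical sample record
    s = json.dumps(movie, indent=2, ensure_ascii=False)   # object -> formatted JSON string, Chinese kept readable
    print(s)
    print(json.loads(s)['name'])                          # string -> object again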

    from pyquery import PyQuery
    import re
    import json
    import requests
    
    
    def get_html(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            print('request failed:', url)
            return None
    
    def parse_time(text):
        # extract a date like 2018-11-30 (or just the year) from the release-time string
        match = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
        return match.group() if match else ''
    
    def parse_html(html, f):
        doc = PyQuery(html)
        dd_nodes = doc('dl.board-wrapper')
        ranks = dd_nodes('.board-index').items()
        names = dd_nodes('.name').items()
        actors = dd_nodes('.star').items()
        times = dd_nodes('.releasetime').items()
        integers = dd_nodes('.integer').items()
        fractions = dd_nodes('.fraction').items()
    
        for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
            data = {
                'rank': rank.text(),
                'name': name.text(),
                'actor': actor.text().replace('主演:', ''),
                'time': parse_time(ts.text()),
                'score': integer.text() + fraction.text()
            }
            f.write(json.dumps(data, indent=2, ensure_ascii=False) + '\n')
    
    
    if __name__ == '__main__':
        url = 'http://maoyan.com/board/4'
    
        with open('movie_json.txt', 'w', encoding='utf-8') as f:
            for i in range(10):
                path = url + '?offset=' + str(i*10)
                print(path)
                html = get_html(path)
                if html:
                    parse_html(html, f)
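
    Note that the file written above is a series of pretty-printed JSON objects concatenated together, not a single valid JSON document, so it cannot be parsed back with one json.loads call. A common alternative (a sketch, not part of the code above) is to write one compact object per line (JSON Lines), e.g. f.write(json.dumps(data, ensure_ascii=False) + '\n'), which reads back easily:

    import json

    # assumes one JSON object per line (JSON Lines format)
    with open('movie_json.txt', encoding='utf-8') as f:
        movies = [json.loads(line) for line in f if line.strip()]
    print(len(movies), 'movies')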

    CSV

    import csv
    
    with open('data.csv', 'w', newline='') as csvfile:   # newline='' prevents blank lines between rows on Windows
        writer = csv.writer(csvfile, delimiter=',')  # delimiter defaults to ','
        writer.writerow(['id', 'name', 'age'])
        writer.writerow(['1001', 'mike', 20])
        writer.writerow(['1002', 'bob', 22])
        writer.writerow(['1003', 'jordan', 21])
        # writer.writerows([['1001', 'mike', 20], ['1002', 'bob', 22], ['1003', 'jordan', 21]])  # write multiple rows at once
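
    The delimiter can be anything; a short sketch writing space-separated values instead (the file name is made up):

    import csv

    with open('data_space.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')   # use a space instead of a comma
        writer.writerow(['id', 'name', 'age'])
        writer.writerow(['1001', 'mike', 20])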

    Writing dictionary data to CSV

    import csv
    
    with open('data.csv', 'w') as csvfile:
        fieldnames = ['id', 'name', 'age']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)   # fieldnames defines the header columns
        writer.writeheader()
        writer.writerow({'id': 1001, 'name': 'mike', 'age': 20})
        writer.writerow({'id': 1002, 'name': 'bob', 'age': 22})
        writer.writerow({'id': 1003, 'name': 'char', 'age': 24})
    # Appending data
    
    import csv
    
    with open('data.csv', 'a', newline='', encoding='utf-8') as csvfile:   # 'a' appends; encoding set explicitly
        fieldnames = ['id', 'name', 'age']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)   # fieldnames defines the header columns
        # the header was already written above, so writeheader() is not repeated when appending
        writer.writerow({'id': 1001, 'name': '', 'age': 20})
        writer.writerow({'id': 1002, 'name': '', 'age': 22})
        writer.writerow({'id': 1003, 'name': '', 'age': 24})
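
    When appending, a common pattern (a sketch, not from the original code; the id/name values are made up) is to write the header only if the file does not exist yet or is empty:

    import csv
    import os

    path = 'data.csv'
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'name', 'age'])
        if need_header:
            writer.writeheader()   # header is written at most once
        writer.writerow({'id': 1004, 'name': 'dave', 'age': 23})
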
    # Reading
    
    import csv
    
    with open('data.csv', 'r', encoding='utf-8') as csvfile:   # read back with the same encoding
        reader = csv.reader(csvfile)
        for row in reader:
            print(row)
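
    csv.DictReader is the counterpart of DictWriter: it returns each row as a dict keyed by the header row (a sketch against the data.csv written above):

    import csv

    with open('data.csv', 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row['id'], row['name'], row['age'])   # access fields by column name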

    # Full example: writing the scraped Maoyan data to CSV

    from pyquery import PyQuery
    import csv
    import re
    import requests
    
    
    def get_html(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            print('request failed:', url)
            return None
    
    
    def parse_time(text):
        # extract a date like 2018-11-30 (or just the year) from the release-time string
        match = re.search(r'\d{4}(-\d{2}-\d{2})?', text)
        return match.group() if match else ''
    
    
    
    def parse_html(html):
        doc = PyQuery(html)
        dd_nodes = doc('dl.board-wrapper')
        ranks = dd_nodes('.board-index').items()
        names = dd_nodes('.name').items()
        actors = dd_nodes('.star').items()
        times = dd_nodes('.releasetime').items()
        integers = dd_nodes('.integer').items()
        fractions = dd_nodes('.fraction').items()
        with open('movie.csv', 'a', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['rank', 'name', 'actor', 'time', 'score']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            for rank, name, actor, ts, integer, fraction in zip(ranks, names, actors, times, integers, fractions):
                data = {
                    'rank': rank.text(),
                    'name': name.text(),
                    'actor': actor.text().replace('主演:', ''),
                    'time': parse_time(ts.text()),
                    'score': integer.text() + fraction.text()
                }
                writer.writerow(data)
    
    
    if __name__ == '__main__':
        url = 'http://maoyan.com/board/4'
    
        with open('movie.csv', 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['rank', 'name', 'actor', 'time', 'score']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
        for i in range(10):
            path = url + '?offset=' + str(i*10)
            print(path)
            html = get_html(path)
            if html:
                parse_html(html)