Web Crawlers, Part 2: Storing the Scraped Data

This post shows three ways to persist the same scraped Maoyan ranking page: a CSV file, MySQL, and MongoDB.

    Storing the data in a CSV file

    Fetch the page with urllib.request, pull each movie out with a regex, and append the rows with the csv module:

    import urllib.request
    import re
    import csv
    
    url = 'https://maoyan.com/board/4?offset=10'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    
    # Fetch the page
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    
    # Parse the page: capture each movie's title, stars and release time
    p = re.compile(
        '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
    rList = p.findall(html)
    
    # Write to CSV: open the file once, then append one row per movie
    with open('my1.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for r in rList:
            writer.writerow([field.strip() for field in r])
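
    To sanity-check what was written, the file can be read straight back with csv.reader; a minimal sketch:

    import csv
    
    # Read the rows back for a quick visual check
    with open('my1.csv', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            print(row)   # each row is [name, star, releasetime]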

    Storing the data in MySQL

    The fetch-and-parse steps are the same; the rows go into a MySQL table through pymysql:

    import urllib.request
    import re
    import pymysql
    import warnings
    
    # Silence pymysql warnings (e.g. on repeated runs)
    warnings.filterwarnings("ignore")
    
    url = 'https://maoyan.com/board/4?offset=10'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    
    # Create the database connection (use keyword arguments; recent
    # pymysql versions no longer accept positional connection arguments)
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        database="spiderdb",
        charset="utf8")
    # Cursor object
    cursor = db.cursor()
    
    # Fetch the page
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    
    # Parse the page
    p = re.compile(
        '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
    rList = p.findall(html)
    
    # Store in MySQL: one insert per movie, committed at the end
    ins = 'insert into film(name,star,releasetime) values(%s,%s,%s)'
    for r in rList:
        L = [r[0].strip(),
             r[1].strip(),
             r[2].strip()[5:15]]   # slice off the leading label, keep the 10-char date
        cursor.execute(ins, L)
    db.commit()
    
    cursor.close()
    db.close()
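
    The insert assumes a film table already exists in spiderdb. The post never shows its schema, so here is one plausible one-time setup; the column types and lengths are assumptions:

    import pymysql
    
    # One-time table setup. The column names match the insert above; the
    # types and lengths are assumptions, since the post does not give a schema.
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="spiderdb", charset="utf8")
    cursor = db.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS film (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(100),
            star VARCHAR(300),
            releasetime VARCHAR(30)
        ) DEFAULT CHARSET=utf8
    """)
    cursor.close()
    db.close()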

    Storing the data in MongoDB

    Again the same scrape, this time inserting one document per movie into a MongoDB collection with pymongo:

    import urllib.request
    import re
    import pymongo
    
    url = 'https://maoyan.com/board/4?offset=10'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    
    # Create the client, then select the database and collection
    conn = pymongo.MongoClient("127.0.0.1", 27017)
    db = conn["spiderdb"]
    myset = db["film"]
    
    # Fetch the page
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    
    # Parse the page
    p = re.compile(
        '<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
    rList = p.findall(html)
    
    # Store in MongoDB: one document per movie
    for r in rList:
        d = {
            "name": r[0].strip(),
            "star": r[1].strip(),
            "releasetime": r[2].strip()
        }
        myset.insert_one(d)
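
    One caveat: re-running the script inserts the same movies again. If duplicates matter, an upsert can replace the insert inside the loop; treating the movie name as the unique key is an assumption:

    # Inside the loop above, replace myset.insert_one(d) with an upsert so a
    # re-run updates the existing document instead of adding a duplicate.
    # Keying on "name" alone is an assumption.
    myset.update_one({"name": d["name"]}, {"$set": d}, upsert=True)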
Original article: https://www.cnblogs.com/leijing0607/p/7736072.html