zoukankan      html  css  js  c++  java
  • 爬取《红楼梦》章节目录并写入 MySQL 和 MongoDB

    MySQL

    import requests
    import re
    import pymysql
    from bs4 import BeautifulSoup
    
    # Scrape the chapter index page at purepen.com/hlm/ and store every chapter
    # (section label, title, absolute URL) in the MySQL table `content` of the
    # `hlm` database.
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    conn = pymysql.Connect(host='127.0.0.1', user='root', password='123123', database='hlm')
    cursor = conn.cursor()

    url = 'http://www.purepen.com/hlm/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }

    # Compiled once (raw string) instead of being rebuilt for every table row.
    href_pattern = re.compile(r'<td><a href="(.*?)">')

    # `content` is the table name. Placeholders let the driver escape the
    # scraped text, so quotes inside a title cannot break or inject into the SQL.
    insert_sql = "insert into content (section,title,url) values (%s,%s,%s)"

    try:
        res = requests.get(url, headers=headers)
        # The response is decoded as GBK before parsing.
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'lxml')

        for tr_list in soup.find_all(name='tr'):
            td_data = list(tr_list.find_all(name='td'))
            # Only rows with exactly four cells are processed; they are read as
            # two chapters side by side: (section, title-with-link) twice.
            if len(td_data) != 4:
                continue

            section1 = td_data[0].text
            title1 = td_data[1].text
            url1 = url + href_pattern.findall(str(td_data[1]))[0]

            section2 = td_data[2].text
            title2 = td_data[3].text
            url2 = url + href_pattern.findall(str(td_data[3]))[0]

            print(section1, title1, url1, '\n',
                  section2, title2, url2)

            cursor.executemany(insert_sql, [
                (section1, title1, url1),
                (section2, title2, url2),
            ])

            # Remember to commit, otherwise the inserts are not persisted.
            conn.commit()
    finally:
        # Release the cursor and connection even if scraping or inserting fails.
        cursor.close()
        conn.close()
    
    

    MongoDB

    import requests
    import re
    import pymysql
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    
    # Scrape the chapter index page at purepen.com/hlm/ and store every chapter
    # as a document in the `lj` collection of the `ljw` MongoDB database.
    client = MongoClient('localhost', 27017)
    db = client['ljw']
    # Keep the database and the collection under separate names instead of
    # rebinding `db` to the collection.
    collection = db.lj

    url = 'http://www.purepen.com/hlm/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }

    # Compiled once (raw string) instead of being rebuilt for every table row.
    href_pattern = re.compile(r'<td><a href="(.*?)">')

    try:
        res = requests.get(url, headers=headers)
        # The response is decoded as GBK before parsing.
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'lxml')

        for tr_list in soup.find_all(name='tr'):
            td_data = list(tr_list.find_all(name='td'))
            # Only rows with exactly four cells are processed; they are read as
            # two chapters side by side: (section, title-with-link) twice.
            if len(td_data) != 4:
                continue

            section1 = td_data[0].text
            title1 = td_data[1].text
            url1 = url + href_pattern.findall(str(td_data[1]))[0]

            section2 = td_data[2].text
            title2 = td_data[3].text
            url2 = url + href_pattern.findall(str(td_data[3]))[0]

            print(section1, title1, url1, '\n',
                  section2, title2, url2)

            # Store both chapters of the row; previously the second one was
            # parsed but never written. The result gets its own name so the
            # HTTP response in `res` is not overwritten.
            insert_result = collection.insert_many([
                {'章节': section1, '标题': title1, '网址': url1},
                {'章节': section2, '标题': title2, '网址': url2},
            ])
            print(insert_result)
    finally:
        # Close the client's connections even if scraping or inserting fails.
        client.close()
    
    
    
  • 相关阅读:
    python+selenium+Chrome options参数
    selenium用法
    python-pytest学习(十九)-pytest分布式执行(pytest-xdist)
    python-pytest学习(十八)-运行上次失败用例(--lf和--ff)
    python-pytest学习(十七)-conftest.py作用范围
    python-pytest学习(十六)-fixture作用范围
    python-pytest学习(十六)-多个fixture和fixture直接相互调用
    python-pytest学习(十五)-fixture详解
    python-pytest学习(十四)配置文件pytest.ini
    python-pytest学习(十三)-fixture之autouse=True
  • 原文地址:https://www.cnblogs.com/kai-/p/12662845.html
Copyright © 2011-2022 走看看