zoukankan      html  css  js  c++  java
  • 红楼梦 + 写入 MySQL + MongoDB

    MySQL

    import requests
    import re
    import pymysql
    from bs4 import BeautifulSoup
    
    # Scrape the chapter index page and store every chapter it lists
    # (section label, title, absolute URL) in the MySQL table `content`.
    conn = pymysql.Connect(host='127.0.0.1', user='root', password='123123', database='hlm')
    try:
        url = 'http://www.purepen.com/hlm/'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        # Decode the body as GBK before handing it to the parser.
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'lxml')

        # Only rows with exactly four cells are treated as index rows; each
        # holds two chapters laid out as (section, title, section, title).
        chapters = []
        for tr in soup.find_all(name='tr'):
            td_data = tr.find_all(name='td')
            if len(td_data) != 4:
                continue
            for section_td, title_td in ((td_data[0], td_data[1]), (td_data[2], td_data[3])):
                # Skip cells without a link (e.g. a header or a padding cell)
                # instead of failing on a missing href.
                link = title_td.find('a', href=True)
                if link is None:
                    continue
                chapter = (section_td.text, title_td.text, url + link['href'])
                print(*chapter)
                chapters.append(chapter)

        # `content` is the table name. The driver binds the values itself, so
        # quotes inside a title can neither break nor alter the statement.
        sql = "insert into content (section,title,url) values (%s,%s,%s)"
        with conn.cursor() as cursor:
            if chapters:
                cursor.executemany(sql, chapters)

        # Commit once so the whole index is stored in a single transaction.
        conn.commit()
    finally:
        # Release the connection even when the request or an insert fails.
        conn.close()
    
    

    MongoDB

    import requests
    import re
    import pymysql
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    
    # Scrape the chapter index page and store every chapter it lists as one
    # document in the `lj` collection of the `ljw` MongoDB database.
    client = MongoClient('localhost', 27017)
    try:
        db = client['ljw']
        collection = db.lj

        url = 'http://www.purepen.com/hlm/'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        # Decode the body as GBK before handing it to the parser.
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'lxml')

        # Only rows with exactly four cells are treated as index rows; each
        # holds two chapters laid out as (section, title, section, title).
        for tr in soup.find_all(name='tr'):
            td_data = tr.find_all(name='td')
            if len(td_data) != 4:
                continue
            # Store both chapters of the row, not just the first one.
            for section_td, title_td in ((td_data[0], td_data[1]), (td_data[2], td_data[3])):
                # Skip cells without a link (e.g. a header or a padding cell)
                # instead of failing on a missing href.
                link = title_td.find('a', href=True)
                if link is None:
                    continue
                section = section_td.text
                title = title_td.text
                chapter_url = url + link['href']
                print(section, title, chapter_url)
                result = collection.insert_one({'章节': section, '标题': title, '网址': chapter_url})
                print(result)
    finally:
        # Release the client's connections even when scraping fails.
        client.close()
    
    
    
  • 相关阅读:
    《Java 学习笔记》 第三章阅读体验
    《Java 学习笔记》 第四章阅读体验
    Android 自定义控件的几个步骤
    新的Android应用 记账理财助手 登陆国内各大市场啦。
    第一个 Android 应用发布到 Google Market 中了
    DoWhat 登录 AppChina应用汇啦
    开始阅读《Java 学习笔记》一书
    Android 开发的多分辨率自适应图片要点
    五种开源协议的比较(BSD,Apache,GPL,LGPL,MIT) – 整理
    在oracle中建立自动编号sql
  • 原文地址:https://www.cnblogs.com/kai-/p/12662845.html
Copyright © 2011-2022 走看看