zoukankan      html  css  js  c++  java
  • 寒假大数据学习笔记十五

      完成了数据爬取工作,共33335条数据,全部保存入库。

      1 import requests
      2 from fake_useragent import UserAgent
      3 from lxml import etree
      4 import re
      5 import pymysql
      6 import time
      7 
      8 
      def open_file(file):
          """Read the original ids stored one per line in *file*.

          Surrounding whitespace (including the line terminator) is removed
          from every line, and blank lines are skipped so that callers never
          receive an empty id.

          Args:
              file: Path of the text file to read.

          Returns:
              A list of id strings in file order.
          """
          # The context manager closes the handle even if reading fails.
          with open(file, "r") as f:
              stripped_lines = (line.strip() for line in f)
              return [original_id for original_id in stripped_lines if original_id]
     16 
     17 
     def _parse_detail(html, date_pattern):
         """Extract the fields of one letter from a parsed detail page.

         Args:
             html: lxml element tree of the detail page.
             date_pattern: Compiled regex whose first group captures a date.

         Returns:
             A tuple ``(question_title, question_date, question_content,
             reply_organ, reply_date, reply_content)``.

         Raises:
             IndexError: If an expected node or date is missing from the page.
         """
         question_title = html.xpath(
             '//div[contains(@class,"col-xs-10 col-sm-10")]//strong/text()')[0].strip()
         question_date = html.xpath(
             '//div[contains(@class,"col-xs-5 col-lg-3")]/text()')[0].strip()
         question_content = html.xpath(
             '//div[contains(@class,"col-xs-12 col-md-12 column p-2")]//text()')
         # The second text node of this div is the one used as the organ name.
         reply_organ = html.xpath(
             '//div[contains(@class,"col-xs-9 col-sm-7")]/text()')[1].strip()
         reply_date = html.xpath(
             '//div[contains(@class,"col-xs-12 col-sm-3")]/text()')[0].strip()
         reply_content = html.xpath(
             '//div[contains(@class,"col-xs-12 col-md-12 column p-4")]//text()')
         return (
             question_title,
             date_pattern.findall(question_date)[0],
             "".join(question_content).strip(),
             reply_organ,
             date_pattern.findall(reply_date)[0],
             "".join(reply_content).strip(),
         )


     def open_url(url, file, type):
         """Crawl every letter detail page listed in *file* and store it.

         For each original id read from *file*, the page at ``url + id`` is
         downloaded, its question/reply fields are extracted and printed, and
         the record is inserted through ``add`` using a fresh connection to
         the ``letter`` database. Pages that cannot be downloaded or that
         lack an expected field are reported and skipped.

         Args:
             url: Detail-page URL prefix; the original id is appended to it.
             file: Path of the text file holding one original id per line.
             type: Category label stored with every record (e.g. "建议").
         """
         # Matches dates such as 2020-02-16. The digit classes need their
         # backslashes: without them the pattern only matches the literal
         # letter "d" and every page is skipped.
         date_pattern = re.compile(r"(\d{4}-\d\d-\d\d)")
         # Built once; ``.random`` still yields a different agent per request.
         user_agent = UserAgent()

         for original_id in open_file(file):
             detail_url = url + original_id
             header = {"User-Agent": user_agent.random}
             try:
                 # A timeout keeps one stalled request from hanging the crawl.
                 req = requests.get(detail_url, headers=header, timeout=10)
                 record = _parse_detail(etree.HTML(req.text), date_pattern)
             except requests.RequestException as e:
                 print("request failed, skipped %s: %s" % (detail_url, e))
             except IndexError:
                 # An expected node or date was not found on the page.
                 print("incomplete page, skipped %s" % detail_url)
             else:
                 for field in record:
                     print(field)
                 print(type)
                 # ``add`` closes the connection it is given, so open a new
                 # one for every record.
                 r = add(open_conn("letter"), *record, original_id, type)
                 print(r)
             # Throttle requests to the server.
             time.sleep(0.5)
             print("=" * 20)
     69 
     70 
     def open_conn(dbname, host="localhost", port=3306, user="root",
                   passwd="123456", charset="utf8"):
         """Open a PyMySQL connection to the database *dbname*.

         The defaults reproduce the settings this script has always used, so
         ``open_conn("letter")`` behaves as before; pass explicit values to
         target another server or account.

         Args:
             dbname: Name of the database to select.
             host: Server host name.
             port: Server TCP port.
             user: Account name.
             passwd: Account password. NOTE(review): the default is a
                 hard-coded credential; prefer passing it in from configuration.
             charset: Connection character set.

         Returns:
             The open connection object returned by ``pymysql.connect``.
         """
         # ``password``/``database`` are the current keyword names; the
         # ``passwd``/``db`` aliases are deprecated in PyMySQL.
         return pymysql.connect(
             host=host,
             port=port,
             user=user,
             password=passwd,
             database=dbname,
             charset=charset,
         )
     81 
     82 
     def add(
             db,
             question_title,
             question_date,
             question_content,
             reply_organ,
             reply_date,
             reply_content,
             original_id,
             type):
         """Insert one letter record into ``detail_letter`` and close *db*.

         The connection is always closed before returning or raising, so the
         caller must pass a connection it does not intend to reuse.

         Args:
             db: Open database connection providing ``cursor``, ``commit``,
                 ``rollback`` and ``close``.
             question_title: Title of the letter.
             question_date: Date of the letter.
             question_content: Body text of the letter.
             reply_organ: Name of the replying organisation.
             reply_date: Date of the reply.
             reply_content: Body text of the reply.
             original_id: Original id of the letter.
             type: Category label of the letter.

         Returns:
             The success message string.

         Raises:
             Exception: Whatever the driver raises while executing or
                 committing; the transaction is rolled back first.
         """
         sql = (
             "insert into detail_letter("
             "question_title,question_date,question_content,"
             "reply_organ,reply_date,reply_content,original_id,type) "
             "values(%s,%s,%s,%s,%s,%s,%s,%s)"
         )
         params = [
             question_title,
             question_date,
             question_content,
             reply_organ,
             reply_date,
             reply_content,
             original_id,
             type,
         ]
         try:
             # The cursor is released as soon as the statement has run.
             with db.cursor() as cursor:
                 cursor.execute(sql, params)
             db.commit()
         except Exception:
             # Do not leave a half-finished transaction behind.
             db.rollback()
             raise
         finally:
             # Close on success and on failure alike to avoid leaking connections.
             db.close()
         return "数据插入成功!"
    109 
    110 
    if __name__ == '__main__':
        # Only the suggestion letters are crawled by this run. The complaint
        # and consultation crawls take the same three arguments and are kept
        # here, disabled, for reference:
        #   open_url("http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId=",
        #            "tousu.txt", "投诉")
        #   open_url("http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=",
        #            "zixun.txt", "咨询")
        suggestion_detail_url = (
            "http://www.beijing.gov.cn/hudong/hdjl/"
            "com.web.suggest.suggesDetail.flow?originalId="
        )
        open_url(suggestion_detail_url, "jianyi.txt", "建议")
  • 相关阅读:
    go 基本包
    go 包
    算法笔记--数据结构--链表
    算法笔记--数据结构--队列
    算法笔记--标准模板库STL--pair
    算法笔记--标准模板库STL--stack
    算法笔记--标准模板库STL--priority_queue
    算法笔记--标准模板库STL--queue
    初识pair
    lower_bound实现离散化
  • 原文地址:https://www.cnblogs.com/YXSZ/p/12317478.html
Copyright © 2011-2022 走看看