zoukankan      html  css  js  c++  java
  • python+urllib+BeautifulSoup+pymysql

      1 # -*- coding: utf-8 -*-
      2 # @Time : 2019/11/12 21:22
      3 # @Author : AForever
      4 # @Site : 
      5 # @File : cnblog_002.py
      6 # @Software: PyCharm
      7 
      8 from urllib import request
      9 from bs4 import BeautifulSoup
     10 import os
     11 import pymysql
     12 
     13 
     14 # 获取数据
     15 def get_data():
     16     headers = {
     17         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
     18     }
     19 
     20     for i in range(7):
     21         url = "https://search.51job.com/list/040000,000000,0000,00,9,99,python%25E5%25BC%2580%25E5%258F%2591,2," + str(i+1) + ".html"
     22         req = request.Request(url, headers=headers)
     23         file_name = 'E:\python_project\Spider\cnblogs\data\cnblog_pythonjob' + str(i + 1) + '.html'
     24         print(file_name, url)
     25         response = request.urlopen(req)
     26 
     27         if response.getcode() == 200:
     28             data = response.read()
     29             data = str(data, encoding='gbk')
     30             with open(file_name, mode="w", encoding="gbk") as f:
     31                 f.write(data)
     32     print("*"*15, "get data success", "*"*15)
     33 
     34 
     35 # 解析数据
     36 def parse_data():
     37     path = 'E:\python_project\Spider\cnblogs\data\'
     38     filenames = os.listdir(path)
     39     result = []
     40     for filename in filenames:
     41         # print(filename)
     42         with open(path+filename, mode="r", encoding="gbk") as f:
     43             html = f.read()
     44             bs = BeautifulSoup(html, 'html.parser')
     45             divs = bs.select('#resultList .el')
     46             for div in divs[1:]:
     47                 title = div.select('.t1')[0].get_text(strip=True)
     48                 company = div.select('.t2')[0].get_text(strip=True)
     49                 addr = div.select('.t3')[0].get_text(strip=True)
     50                 salary = div.select('.t4')[0].get_text(strip=True)
     51                 pubdate = div.select('.t5')[0].get_text(strip=True)
     52                 row = {
     53                     'title': title,
     54                     'company': company,
     55                     'addr': addr,
     56                     'salary': salary,
     57                     'pubdate': pubdate
     58                 }
     59                 result.append(row)
     60         # print(result)
     61 
     62     print('*' * 15, 'parse data success, ,Congratulations!', '*' * 15)
     63     # print(result)
     64     return result
     65 
     66 
     67 # 创建数据表
     68 def create_table():
     69     config = {
     70         'host': 'localhost',
     71         'port': 3306,
     72         'user': 'root',
     73         'password': '123456',
     74         'database': 'python',
     75         'charset': 'utf8'
     76     }
     77     conn = pymysql.connect(**config)
     78     cursor = conn.cursor()
     79     # 如果存在student表,则先删除
     80     try:
     81         cursor.execute('DROP TABLE IF EXISTS `t_job`;')
     82         conn.commit()
     83         print('*' * 15, "drop table success", '*' * 15)
     84     except:
     85         print('*' * 15, 'table dose not exist', '*' * 15)
     86 
     87     create_table = '''
     88         create table t_job(
     89         id int primary key auto_increment,
     90         title varchar(200),
     91         company varchar(200),
     92         addr varchar(200),
     93         salary varchar(200),
     94         pubdate varchar(200)
     95         )engine=Innodb charset utf8;
     96         '''
     97     # 创建数据表
     98     cursor.execute(create_table)
     99     cursor.close()
    100     conn.close()
    101     print('*' * 15, 'create tables success,Congratulations!', '*' * 15)
    102 
    103 
    104 # 存储数据到mysql
    105 def save_to_mysql(data):
    106     config = {
    107         'host': 'localhost',
    108         'port': 3306,
    109         'user': 'root',
    110         'password': 'lem600@HW',
    111         'database': 'python',
    112         'charset': 'utf8'
    113     }
    114 
    115     conn = pymysql.connect(**config)
    116     cursor = conn.cursor()
    117     sql = '''
    118     insert into t_job(title, company, addr, salary, pubdate)
    119     values(%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubdate)s)
    120     '''
    121     cursor.executemany(sql, data)
    122     conn.commit()
    123     cursor.close()
    124     conn.close()
    125     print('*' * 15, 'save data to mysql success ,Congratulations !', '*' * 15)
    126 
    127 
    128 if __name__ == "__main__":
    129     get_data()
    130     # parse_data()
    131     create_table()
    132     save_to_mysql(parse_data())
  • 相关阅读:
    vue源码分析—Vue.js 源码目录设计
    vue源码分析—认识 Flow
    在Windows上安装配置MongoDB
    mongoDB概述
    Could not load file or assembly Microsoft.Web.Infrastructure
    配置错误 不能在此路径中使用此配置节(转)
    VS2013快捷键大全
    Create new tool for CSV
    How to get http response.
    C#中Split用法
  • 原文地址:https://www.cnblogs.com/AForever01/p/11845976.html
Copyright © 2011-2022 走看看