  • Python crawler: scraping Sohu Video movies and storing them in a MySQL database

    Code:

      import time
      import traceback

      import requests
      from bs4 import BeautifulSoup  # parsed with the 'lxml' backend below
      import pymysql
      # Connect to the database and obtain a cursor
      def get_conn():
          """
          :return: connection, cursor
          """
          # Create the connection
          conn = pymysql.connect(host="127.0.0.1",
                                 user="root",
                                 password="000429",
                                 db="movierankings",
                                 charset="utf8")
          # Create the cursor; result sets come back as tuples by default
          cursor = conn.cursor()
          if conn is not None and cursor is not None:
              print("Database connected and cursor created!")
          else:
              print("Database connection failed!")
          return conn, cursor
      # Close the cursor and the database connection
      def close_conn(conn, cursor):
          if cursor:
              cursor.close()
          if conn:
              conn.close()
          return 1

      def get_souhu():
          # Top rated
          url = 'https://film.sohu.com/list_0_0_0_2_2_1_60.html?channeled=1200100000'
          # Newest releases
          new_url = 'https://film.sohu.com/list_0_0_0_2_1_1_60.html?channeled=1200100000'
          # This week's hottest
          week_url = 'https://film.sohu.com/list_0_0_0_2_0_1_60.html?channeled=1200100000'
          headers = {
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
          }

          # Initialize the result lists
          templist = []
          dataRes = []
          # Top rated
          for i in range(1, 31):
              url_1 = 'https://film.sohu.com/list_0_0_0_2_2_'
              auto = str(i)
              url_2 = '_60.html?channeled=1200100000'
              url = url_1 + auto + url_2
              # headers must be passed as a keyword argument; as the second
              # positional argument requests.get() would treat the dict as params
              response = requests.get(url, headers=headers)
              response.encoding = 'utf-8'
              page_text = response.text
              # Collect all the <li> elements
              soup = BeautifulSoup(page_text, 'lxml')
              # Select by tag hierarchy
              li_list = soup.select('.movie-list>li')
              print(len(li_list))
              if len(li_list) == 0:
                  print("Finished crawling the top-rated list!")
                  if len(dataRes) != 0:
                      return dataRes
              for li in li_list:
                  li_soup = BeautifulSoup(str(li), 'lxml')
                  # Movie name
                  name = li_soup.find('div', class_="v_name_info").text
                  templist.append(name)
                  # Score: slice the numeric part out of the label text
                  score = li_soup.find('span', class_='v_score').text
                  score = score[-4:-1]
                  templist.append(score)
                  # Detail-page path
                  path = li_soup.find('a', target="_blank")['href']
                  templist.append(path)
                  # Playback state
                  state = "VIP"
                  templist.append(state)
                  print(templist)
                  dataRes.append(templist)
                  templist = []
              print("-------------------------------------------")
          # print(len(dataRes))
          # "Newest releases": same loop as above with
          # url_1 = 'https://film.sohu.com/list_0_0_0_2_1_'
          # "This week's hottest": same loop as above with
          # url_1 = 'https://film.sohu.com/list_0_0_0_2_0_'
          # Optional order-preserving dedup of dataRes:
          # new_list = []
          # for i in dataRes:
          #     if i not in new_list:
          #         new_list.append(i)
          return dataRes
      # Insert the scraped rows into the database
      def insert_souhu():
          cursor = None
          conn = None
          try:
              count = 0
              movie_list = get_souhu()
              print(f"{time.asctime()} Starting to insert Sohu movie data")
              conn, cursor = get_conn()
              sql = "insert into moviesohu (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
              for item in movie_list:
                  print(item)
                  count = count + 1
                  # Catch the exception so duplicate primary keys don't abort the run
                  try:
                      cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
                  except pymysql.err.IntegrityError:
                      print("Duplicate! Skipping!")
              conn.commit()  # Commit the transaction (update/delete/insert)
              print(f"{time.asctime()} Finished inserting Sohu movie data")
          except:
              traceback.print_exc()
          finally:
              close_conn(conn, cursor)
          return

      if __name__ == '__main__':
          # get_iqy()
          # get_souhu()
          insert_souhu()
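
    The three list pages differ only in one digit of the URL path (2 = top rated,
    1 = newest releases, 0 = this week's hottest), which is why the original
    crawled each category with a copy of the same loop. A minimal sketch of a
    parameterized fetch, assuming the page structure is identical across the
    three categories (the helper name and category map are illustrative, not
    part of the original code):

      import requests
      from bs4 import BeautifulSoup

      # Hypothetical helper: one loop covers all three Sohu list categories
      CATEGORY_CODES = {"top_rated": 2, "newest": 1, "weekly_hot": 0}

      def crawl_sohu_category(code, max_pages=30, headers=None):
          rows = []
          for page in range(1, max_pages + 1):
              url = (f'https://film.sohu.com/list_0_0_0_2_{code}_'
                     f'{page}_60.html?channeled=1200100000')
              response = requests.get(url, headers=headers)
              response.encoding = 'utf-8'
              soup = BeautifulSoup(response.text, 'lxml')
              li_list = soup.select('.movie-list>li')
              if not li_list:  # an empty page marks the end of the category
                  break
              for li in li_list:
                  name = li.find('div', class_="v_name_info").text
                  score = li.find('span', class_='v_score').text[-4:-1]
                  path = li.find('a', target="_blank")['href']
                  rows.append([name, score, path, "VIP"])
          return rows

    Each category then collapses to a single call, e.g.
    crawl_sohu_category(CATEGORY_CODES["newest"], headers=headers).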

    Run screenshot

    Database screenshot

    Table creation statement

      CREATE TABLE `moviesohu` (
        `id` INT(11) NOT NULL AUTO_INCREMENT,
        `name` VARCHAR(45) COLLATE utf8_bin NOT NULL,
        `score` VARCHAR(45) COLLATE utf8_bin NOT NULL,
        `path` VARCHAR(100) COLLATE utf8_bin NOT NULL,
        `state` VARCHAR(10) COLLATE utf8_bin NOT NULL,
        PRIMARY KEY (`name`),
        KEY `id` (`id`)
      ) ENGINE=INNODB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
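
    Because the name column is the primary key, re-running the crawler raises an
    IntegrityError for every movie already stored, which insert_souhu() catches
    row by row. MySQL can also skip duplicates server-side with INSERT IGNORE;
    a minimal sketch, reusing the helpers above:

      # Sketch: server-side duplicate handling instead of catching IntegrityError.
      # INSERT IGNORE silently skips rows whose `name` already exists.
      sql = ("INSERT IGNORE INTO moviesohu (id, name, score, path, state) "
             "VALUES (%s, %s, %s, %s, %s)")

      conn, cursor = get_conn()
      for item in get_souhu():
          cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
      conn.commit()
      close_conn(conn, cursor)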