  • Python crawler: scrape Douban movie names and links and save them to a txt file, an Excel workbook, and a MySQL database

    The prerequisite is that the Python environment for working with Excel and MySQL is fully set up, which means the relevant dependency packages must be installed and importable in Python (see the quick check below).
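
    Before running the crawler, it can help to confirm that the two third-party packages are importable: openpyxl for Excel, and MySQL-python, which provides the MySQLdb module (both can typically be installed with pip). A minimal Python 2 check, assuming those package names, might look like this:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    # Minimal dependency check (assumes the packages were installed,
    # e.g. via: pip install openpyxl MySQL-python)
    import openpyxl   # Excel read/write support
    import MySQLdb    # MySQL driver used by the crawler

    print 'openpyxl version:', openpyxl.__version__
    print 'dependencies OK'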

    The full implementation is as follows:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    import urllib2
    import sys
    import re
    import ssl
    import openpyxl
    import MySQLdb
    import time

    # Switch the default encoding to utf-8 so the Chinese titles can be written out
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # Skip HTTPS certificate verification for the Douban requests
    ssl._create_default_https_context = ssl._create_unverified_context

    # Global lists holding the scraped movie names and link URLs
    nameLists = []
    linkLists = []

    # Scrape the Douban Top 250 movies and save them to a text file, Excel and MySQL
    class TopMove:
        # Initialization: base URL of the paginated Top 250 list
        def __init__(self):
            self.URL = 'https://movie.douban.com/top250?start='

        # Fetch one page of the list (25 movies per page)
        def GetHTML(self, page):
            try:
                url = self.URL + str(page * 25)
                request = urllib2.Request(url)
                response = urllib2.urlopen(request)
                html = response.read().decode('utf-8')
                return html
            # Report the reason when the connection fails
            except urllib2.URLError, e:
                if hasattr(e, "reason"):
                    print u'Failed to connect to Douban Movie, reason:', e.reason
                return None

        # Collect the names and links of all 250 movies
        def GetList(self):
            for page in range(10):
                print "Fetching movie list, page " + str(page + 1)
                html = self.GetHTML(page)
                # Some entries carry more than one title tag, so match the name from the img tag instead
                name = re.compile('<img alt="(.*?)".*?>', re.S)
                link = re.compile('<div class="hd">.*?<a.*?href="(.*?)".*?>.*?</a>', re.S)

                nameList = re.findall(name, html)
                linkList = re.findall(link, html)
                for name in nameList:
                    # Skip alternative titles, which contain a "/"
                    if name.find('/') == -1:
                        nameLists.append(name)
                for link in linkList:
                    linkLists.append(link)
                print "Page fetched"
            return nameLists, linkLists

        # Save the results as a plain text file
        def save_Text(self):
            try:
                f = open('D:/learn/date.txt', 'a')

                for i in range(250):
                    # Write each movie name and link on its own line
                    f.write(nameLists[i])
                    f.write(' ' * 3)
                    f.write(linkLists[i])
                    f.write('\n')
                # Close the file
                f.close()
            except Exception as e:
                print e

            print u"Text file saved"

        # Save the results as an Excel workbook
        def save_Excel(self):
            try:
                # Create a new workbook
                wb = openpyxl.Workbook()
                # Get the active worksheet
                sheet = wb.get_active_sheet()
                # Rename the sheet
                sheet.title = 'Move Top 250'
                for i in range(1, 251):
                    one = 'a' + str(i)  # cell in column A
                    two = 'b' + str(i)  # cell in column B
                    sheet[one] = nameLists[i - 1]
                    sheet[two] = linkLists[i - 1]
                # Save the workbook; the file name contains Chinese characters
                wb.save(ur'D:/learn/豆瓣电影TOP250.xlsx')

            except Exception as e:
                print e
            print 'Excel file saved'

        # Save the results into a local MySQL database
        def save_Mysql(self):
            try:
                # Connect to the database
                conn = MySQLdb.connect(
                    host='localhost',
                    port=3306,
                    user='root',
                    passwd='lebb123',
                    db='pytest',
                    charset='utf8'
                )
                # Get a cursor for executing statements
                cursor = conn.cursor()
                print 'Connected to MySQL'
                # Drop the table if it already exists
                cursor.execute('Drop table if EXISTS MovieTop')
                time.sleep(3)
                # Create the table
                cursor.execute(
                    """create table if not EXISTS MovieTop(
                    id int(4) not null primary key auto_increment,
                    movieName varchar(200),
                    link varchar(200));"""
                )
                for i in range(250):
                    # Insert one row per movie
                    sql = 'insert into MovieTop(movieName,link) VALUES (%s,%s)'
                    param = (nameLists[i], linkLists[i])
                    # Execute the SQL
                    cursor.execute(sql, param)
                # Commit the transaction and release the connection
                conn.commit()
                cursor.close()
                conn.close()
            except Exception as e:
                print e
            print "Data saved in MySQL"


        # Run the whole pipeline: scrape, then save to text, Excel and MySQL
        def Start(self):
            self.GetList()
            self.save_Text()
            self.save_Excel()
            self.save_Mysql()


    dytop = TopMove()
    dytop.Start()
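
    After the script finishes, the stored data can be read back as a quick sanity check. The snippet below is only a rough sketch, not part of the original script; it assumes the same Excel path and MySQL credentials used above:

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    # Verification sketch: re-open the Excel file and count the MySQL rows
    # (assumes the paths and credentials from the crawler above)
    import openpyxl
    import MySQLdb

    # Print the first five name/link pairs from the workbook
    wb = openpyxl.load_workbook(ur'D:/learn/豆瓣电影TOP250.xlsx')
    sheet = wb.active
    for row in range(1, 6):
        print sheet['a' + str(row)].value, sheet['b' + str(row)].value

    # Count the rows stored in the MovieTop table
    conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                           passwd='lebb123', db='pytest', charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select count(*) from MovieTop')
    print 'rows in MovieTop:', cursor.fetchone()[0]
    cursor.close()
    conn.close()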

  • Original article: https://www.cnblogs.com/lebb1993/p/6104859.html