zoukankan      html  css  js  c++  java
  • Python3.6爬虫+Django2.0+MySQL --数据爬取

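    1.下载对应版本的 Python MySQL 模块。我的是:pymssql-2.2.0.dev0-cp36-cp36m-win_amd64.whl(注意:pymssql 是 SQL Server 的驱动;下面代码中 import MySQLdb 使用的是 mysqlclient 包,需要安装与 Python 版本对应的 mysqlclient whl)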

    2.手动创建table  

    -- Table filled by the crawler's "insert into grilsbase(...)" statement.
    -- Every scraped field is kept as text (varchar); only id is numeric.
    create table grilsbase
    (
    -- surrogate key generated by the database
    id int primary key auto_increment,
    name varchar(50),height varchar(50),bwh varchar(50),title varchar(100),img_upload varchar(100),pc_img_upload varchar(100),
    resource_id varchar(50),totals varchar(50),recommend_id varchar(50),date varchar(50),headimg_upload varchar(50),
    show_datetime varchar(50),client_show_datetime varchar(50),video_duration varchar(50),free_select varchar(50),
    trial_time varchar(50),viewtimes varchar(50),coop_customselect_654 varchar(50),coop_id varchar(50),tag_class varchar(50),
    tag_name varchar(50),playerid varchar(50),block_detailid varchar(50),type varchar(50),istop varchar(50)
    )

    3.实现爬虫代码 

       导入模块:requests, os, json, re, MySQLdb, threading

       流程:获取数据=>分析数据=>解析数据=>持久化保存

      1 #coding:utf-8
      2 import  requests
      3 import os
      4 import json
      5 import re
      6 import MySQLdb
      7 import threading
      # URLs the crawler reads from
      gilsUrl='http://act.vip.xunlei.com/ugirls/js/ugirlsdata.js'
      gilsDetailUrl='http://meitu.xunlei.com/detail.html'
      # image URL template: filled with (resource_id, file name)
      gilsImgUrl='http://data.meitu.xunlei.com/data/image/%s/%s'
      executor = threading.BoundedSemaphore(10)
      # Last path segment of a URL ending in ".jpg", extension included.
      # The dot is escaped so that only a literal ".jpg" suffix matches.
      regex=re.compile(r'/([^/]*?\.jpg)$')
      # Same segment, without the ".jpg" extension.
      regexhead=re.compile(r'/([^/]*?)\.jpg$')
     class MySQL:
         """Small MySQLdb helper that opens one connection per statement.

         ExecSql and ExecQuery each obtain a fresh connection through
         GetConnect, run a single statement and close the connection again.
         """

         def __init__(self,host,user,pwd,db):
             """Store the connection settings; no connection is opened here."""
             self.host=host
             self.user=user
             self.db=db
             self.pwd=pwd

         def GetConnect(self):
             """Open a connection and return a cursor on it.

             The connection is kept on ``self.connect`` so the caller can
             commit and close it.

             Raises:
                 NameError: if no database name is configured or no cursor
                     could be obtained.
             """
             if not self.db:
                 # ``raise(NameError, msg)`` raises a tuple, which Python 3
                 # rejects with TypeError; raise the exception instance.
                 raise NameError('没有目标数据库')
             self.connect=MySQLdb.connect(host=self.host,user=self.user,password=self.pwd,database=self.db,port=3306,charset='utf8')
             cur=self.connect.cursor()
             if not cur:
                 # do not leave the freshly opened connection dangling
                 self.connect.close()
                 raise NameError('数据库访问失败')
             return cur

         def ExecSql(self,sql,params=None):
             """Execute a data-modifying statement and commit it.

             Args:
                 sql: statement text; may contain ``%s`` placeholders.
                 params: optional sequence of values bound to the placeholders
                     by the driver (preferred over formatting them into *sql*).
             """
             cur=self.GetConnect()
             try:
                 cur.execute(sql,params)
                 self.connect.commit()
             finally:
                 # close even when execute/commit fails
                 self.connect.close()

         def ExecQuery(self,sql,params=None):
             """Execute a query and return all rows from ``fetchall()``.

             Args:
                 sql: query text; may contain ``%s`` placeholders.
                 params: optional sequence of values bound to the placeholders.
             """
             cur=self.GetConnect()
             try:
                 cur.execute(sql,params)
                 return cur.fetchall()
             finally:
                 self.connect.close()
     41 
     def getGirlsData():
         """Download the data script at ``gilsUrl`` and return its JSON payload, parsed.

         The text captured after ``var ugirlsData=`` is also written to
         ``ugirlsdata.json`` in the current working directory.

         Raises:
             IndexError: if the response does not contain ``var ugirlsData=``.
         """
         # local name chosen so the module-level ``regex`` is not shadowed
         pattern=re.compile("var ugirlsData=(.+)")
         # without a timeout a stalled connection would block the crawler forever
         r=requests.get(gilsUrl,timeout=30)
         found=pattern.search(r.text)
         if found is None:
             raise IndexError('var ugirlsData= not found in response (HTTP %s) from %s' % (r.status_code,gilsUrl))
         payload=found.group(1)
         with open('ugirlsdata.json','w',encoding='utf-8') as f:
             f.write(payload)
         return json.loads(payload)
     50 
     51 
     def getImgName(imgurl):
         """Return the part of *imgurl* captured by the module-level ``regex``.

         That pattern captures the last path segment of a URL ending in
         ``jpg``, extension included.

         Returns:
             The captured file name, or '' when *imgurl* is empty/None or
             does not match.
         """
         # also covers None (e.g. a JSON null), which used to crash in findall
         if not imgurl:
             return ''
         # findall always returns a list, never None
         m=regex.findall(imgurl)
         return m[0] if m else ''
     60 
     def getImgNameHead(imgurl):
         """Return the part of *imgurl* captured by the module-level ``regexhead``.

         That pattern captures the last path segment of a URL ending in
         ``jpg``, without the extension.

         Returns:
             The captured name, or '' when *imgurl* is empty/None or does
             not match.
         """
         # also covers None (e.g. a JSON null), which used to crash in findall
         if not imgurl:
             return ''
         # findall always returns a list, never None
         m=regexhead.findall(imgurl)
         return m[0] if m else ''
     69 
     def WriteDB(jsdata):
         """Insert every record of *jsdata* into ``grilsbase`` and download its images.

         Args:
             jsdata: iterable of dicts as returned by ``getGirlsData``.

         Raises:
             KeyError: if a record lacks a required field (every column except
                 the image and optional ones listed below).

         NOTE: the database host and credentials are hard-coded below.
         """
         columns=(
             'name','height','bwh','title','img_upload','pc_img_upload','resource_id','totals','recommend_id',
             'date','headimg_upload','show_datetime','client_show_datetime','video_duration','free_select','trial_time',
             'viewtimes','coop_customselect_654','coop_id','tag_class','tag_name','playerid','block_detailid','type','istop',
         )
         # columns stored as the bare file name returned by getImgName
         imgColumns=('img_upload','headimg_upload')
         # columns that may be missing from a record; stored as '' in that case
         optionalColumns=('tag_class','tag_name','playerid')
         # Placeholders are bound by the driver, so quotes inside a value can
         # neither break the statement nor inject SQL.
         sql='insert into grilsbase(%s) values(%s)' % (','.join(columns),','.join(['%s']*len(columns)))
         ms = MySQL(host="192.168.0.108", user="lin", pwd="12345678", db="grils")
         for data in jsdata:
             values=[]
             for col in columns:
                 if col in imgColumns:
                     value=getImgName(data.get(col) or '')
                 elif col in optionalColumns:
                     value=data.get(col,'')
                 else:
                     value=data[col]
                 # the old '%s' formatting stored the text form of every value
                 values.append(str(value))
             # one connection per record
             cur=ms.GetConnect()
             try:
                 cur.execute(sql,tuple(values))
                 ms.connect.commit()
             finally:
                 ms.connect.close()
             print('完成'+data['name']+'数据更新...')
             # same tolerant lookup as the insert: a missing image URL means "no image"
             DownImg(data['name'],data["totals"],data['resource_id'],data.get("headimg_upload") or '',data.get("img_upload") or '')
     85         
     86 
     87 
     def DownImg(name,totals,resource_id,headimg_upload,img_upload):
         """Download the cover images and the numbered gallery images of one record.

         Files are written into the directory returned by ``creatFile(resource_id)``.

         Args:
             name: record name (not used by this function).
             totals: number of gallery images; converted with ``int``.
             resource_id: id used for the target directory and the image URLs.
             headimg_upload: head image URL; skipped when blank.
             img_upload: cover image URL; skipped when blank.
         """
         target=creatFile(resource_id)
         # cover images are saved under the name extracted from their own URL
         for cover in (headimg_upload,img_upload):
             if cover.strip()!='':
                 DownImgRun(cover,target,getImgNameHead(cover))
         # gallery images are numbered 1..totals on the server
         count=int(totals)
         for offset in range(count):
             page=offset+1
             DownImgRun(gilsImgUrl%(resource_id,'{}.jpg'.format(page)),target,page)
    104     
    105         
    106 
    def DownImgRun(url,path,i):
        """Fetch *url* and save the body as ``<path>/<i>.jpg``.

        Nothing is written unless the response status is 200.
        """
        response=requests.get(url)
        if response.status_code!=200:
            return
        target='%s/%s.jpg' % (path,i)
        with open(target,'wb') as out:
            out.write(response.content)
    114             
    115 
    116 
    def creatFile(dirname):
        """Make sure ``./pic/<dirname>`` exists and return that path.

        Args:
            dirname: directory name below ``./pic``; converted with ``str`` so
                a numeric id works as well.

        Returns:
            The relative path ``'./pic/<dirname>'``.
        """
        path='./pic/'+str(dirname)
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(path,exist_ok=True)
        return path
    124 
    125 
    if __name__ == '__main__':
        # Script entry point: fetch the data, then store it and download the images.
        WriteDB(getGirlsData())

    4.运行效果 和结果

     

  • 相关阅读:
    用webclient.DownloadFile下载exe文件时大小为0
    C# ,asp.net 获取当前,相对,绝对路径(转)
    c#读取进程列表判断程序是否已经启动(转)
    如何提升页面渲染效率
    前端知识点总结——VUE
    ie7ajax 跨域 no transport 解决办法
    酷炫网址
    框架学习官网
    JavaScript数组方法大全(推荐)
    rem是如何实现自适应布局的
  • 原文地址:https://www.cnblogs.com/linsu/p/8606916.html
Copyright © 2011-2022 走看看