zoukankan      html  css  js  c++  java
  • 爬取京东数据

    #模仿浏览器
    #下载地址
    #创建数据库
    #初始化downloadSpider文件夹
    #访问京东页面
    #数据库增删改查
    import datetime
    import os
    import sqlite3
    import threading
    import time
    import urllib
    import urllib.parse
    import urllib.request

    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    class MySipider:
        """Crawl JD.com search results with headless Chrome.

        For every product found, one row is stored in the ``phones`` table of
        a SQLite database and the product image is downloaded into
        ``imagePath`` on a background thread.

        Typical use is a single call to :meth:`ExecuteSpider`, optionally
        followed by :meth:`ShowDB` to print what was stored.
        """

        # HTTP headers sent with every image request (mimics a mobile browser).
        herder = {
            "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00"
        }
        # Directory the product images are downloaded into.
        imagePath = "downloadSpider"
        # SQLite database file shared by StartUp and ShowDB.
        dbPath = "phones.db"

        _CREATE_TABLE_SQL = (
            "create table phones(mNO varchar(32) primary key,mMark varchar(126),"
            "mNote varchar(126),mPrice varchar(126),mFile varchar(126))"
        )

        def StartUp(self, url, key):
            """Start the browser, reset storage and submit the search.

            Args:
                url: Page that contains the search box (element id ``key``).
                key: Search keyword typed into the box.

            Raises:
                sqlite3.Error: If the database cannot be opened or reset.
                selenium WebDriverException: If the browser cannot be started
                    or the search box is not found.
            """
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=options)
            self.driver.maximize_window()
            self.thread = []
            self.No = 0
            self.imgNo = 0
            # Recreate the table so every crawl starts from an empty database.
            self._init_db(MySipider.dbPath)
            # Create / empty the download folder.
            self._reset_image_dir()
            # Open the first page and submit the keyword. ENTER is what
            # actually triggers the search; typing the keyword alone does not.
            self.driver.get(url)
            keyInput = self.driver.find_element(By.ID, 'key')
            keyInput.send_keys(key)
            keyInput.send_keys(Keys.ENTER)

        def _init_db(self, path):
            """Open the database at *path* and recreate an empty ``phones`` table."""
            self.con = sqlite3.connect(path)
            self.cursor = self.con.cursor()
            self.cursor.execute("drop table if exists phones")
            self.cursor.execute(MySipider._CREATE_TABLE_SQL)

        def _reset_image_dir(self):
            """Create the download folder if needed and delete the files in it."""
            try:
                if not os.path.exists(MySipider.imagePath):
                    os.mkdir(MySipider.imagePath)
                for img in os.listdir(MySipider.imagePath):
                    os.remove(os.path.join(MySipider.imagePath, img))
            except OSError as err:
                # Best effort: a leftover file must not abort the crawl.
                print(err)

        def CloseUp(self):
            """Commit and close the database, then shut the browser down.

            Each resource is released independently so that a failure while
            closing one does not leak the other. Safe to call when StartUp
            failed half-way or when CloseUp already ran.
            """
            con = getattr(self, "con", None)
            if con is not None:
                try:
                    con.commit()
                except sqlite3.Error as err:
                    print(err)
                finally:
                    con.close()
                    self.con = None
            driver = getattr(self, "driver", None)
            if driver is not None:
                try:
                    # quit() also terminates the chromedriver process;
                    # close() would only close the current window.
                    driver.quit()
                except Exception as err:
                    print(err)
                finally:
                    self.driver = None

        def InsertDB(self, mNO, mMark, mPrice, mNote, mFile):
            """Insert one product row; report (but do not raise) database errors.

            Args:
                mNO: Primary key of the row.
                mMark: Brand / first word of the product title.
                mPrice: Price text.
                mNote: Full product title.
                mFile: Local image file name, or ``""`` when there is none.
            """
            try:
                sql = "insert into phones(mNO,mMark,mPrice,mNote,mFile) values(?,?,?,?,?)"
                self.cursor.execute(sql, (mNO, mMark, mPrice, mNote, mFile))
            except sqlite3.Error as err:
                print(err)

        def ShowDB(self):
            """Print number, brand, price and image file of every stored row."""
            try:
                con = sqlite3.connect(MySipider.dbPath)
                try:
                    cursor = con.cursor()
                    cursor.execute("select mNO,mMark,mPrice,mFile from phones order by mNO")
                    for row in cursor.fetchall():
                        print(row[0], row[1], row[2], row[3])
                finally:
                    con.close()
            except sqlite3.Error as err:
                print(err)

        def downloadSpider(self, src1, src2, mFile):
            """Download an image and store it as ``imagePath/mFile``.

            *src1* is tried first; *src2* is used only when *src1* is empty or
            yields no data. Nothing is written when both attempts fail.

            Args:
                src1: Primary image URL (may be empty).
                src2: Fallback image URL (may be empty).
                mFile: File name to store the image under.
            """
            data = None
            for src in (src1, src2):
                if not src:
                    continue
                try:
                    req = urllib.request.Request(src, headers=MySipider.herder)
                    with urllib.request.urlopen(req, timeout=400) as resp:
                        data = resp.read()
                except Exception as err:
                    # Runs on a worker thread: report and try the fallback.
                    print(err)
                if data:
                    break
            if data:
                with open(os.path.join(MySipider.imagePath, mFile), "wb") as fobj:
                    fobj.write(data)
                print("downlaod", mFile)

        @staticmethod
        def _image_file_name(no, src):
            """Return ``no`` plus the file extension of URL *src* ('' if no URL).

            Only the path component is inspected, so a query string or a dot in
            the host name never ends up in the file name.
            """
            if not src:
                return ""
            return no + os.path.splitext(urllib.parse.urlsplit(src).path)[1]

        @staticmethod
        def _text(node, xpath, default):
            """Return the text of the first match of *xpath* under *node*."""
            try:
                return node.find_element(By.XPATH, xpath).text
            except WebDriverException:
                return default

        @staticmethod
        def _attr(node, xpath, name):
            """Return attribute *name* of the first match of *xpath*, or ''."""
            try:
                return node.find_element(By.XPATH, xpath).get_attribute(name) or ""
            except WebDriverException:
                return ""

        def _process_page(self):
            """Store every product on the current page and start its download."""
            base = self.driver.current_url
            # NOTE(review): the original selector used 'gl_item'; every other
            # class name in this file is hyphenated, so 'gl-item' is assumed.
            # Confirm against the live page.
            lis = self.driver.find_elements(
                By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']")
            img_xpath = ".//div[@class='p-img']//a//img"
            for li in lis:
                src1 = self._attr(li, img_xpath, "src")
                # NOTE(review): assumes not-yet-loaded images keep their URL in
                # 'data-lazy-img'; the original read 'src' twice, which made
                # the fallback pointless. Confirm against the live page.
                src2 = self._attr(li, img_xpath, "data-lazy-img")
                price = self._text(li, ".//div[@class='p-price']//i", "0")
                # NOTE(review): assumes the title lives in div.p-name//em and
                # that its first word is the brand; verify on the live page.
                note = self._text(li, ".//div[@class='p-name']//em", "")
                mark = note.split(" ")[0] if note else ""

                self.No += 1
                no = "%06d" % self.No

                if src1:
                    src1 = urllib.parse.urljoin(base, src1)
                if src2:
                    src2 = urllib.parse.urljoin(base, src2)
                mFile = self._image_file_name(no, src1 or src2)
                if src1 or src2:
                    T = threading.Thread(target=self.downloadSpider,
                                         args=(src1, src2, mFile))
                    T.daemon = False
                    T.start()
                    self.thread.append(T)
                self.InsertDB(no, mark, price, note, mFile)

        def _goto_next_page(self):
            """Click the 'next page' link; return False when there is none."""
            disabled = self.driver.find_elements(
                By.XPATH, "//span[@class='p-num']//a[@class='pn-next disabled']")
            if disabled:
                return False
            nextpage = self.driver.find_elements(
                By.XPATH, "//span[@class='p-num']//a[@class='pn-next']")
            if not nextpage:
                return False
            nextpage[0].click()
            return True

        def ProcessSpider(self):
            """Walk all result pages, storing and downloading every product.

            Errors end the crawl and are printed instead of being raised, so
            that rows collected so far can still be committed by CloseUp.
            """
            try:
                while True:
                    # Crude wait for the page to finish rendering.
                    time.sleep(1)
                    print(self.driver.current_url)
                    self._process_page()
                    if not self._goto_next_page():
                        break
            except Exception as err:
                print(err)

        def ExecuteSpider(self, url, key):
            """Run a complete crawl: start up, process all pages, clean up.

            Args:
                url: Start page passed to :meth:`StartUp`.
                key: Search keyword passed to :meth:`StartUp`.
            """
            try:
                self.StartUp(url, key)
                self.ProcessSpider()
            finally:
                # Always release the browser/database, then wait for the
                # image downloads that are still running.
                self.CloseUp()
                for t in getattr(self, "thread", []):
                    t.join()
    # Interactive entry point: show a three-item menu until the user quits.
    url = "http://www.jd.com"
    spider = MySipider()
    while True:
        # Menu: 1 = crawl, 2 = show stored rows, 3 = quit.
        print("1,爬取", "2,显示", "3,退出", sep="\n")
        s = input("请输入选择")
        if s == "3":
            break
        if s == "1":
            # Crawl the search results for the keyword "手机" (mobile phone).
            spider.ExecuteSpider(url, "手机")
        elif s == "2":
            spider.ShowDB()
    
    
            #lis = self.driver.find_element_by_xpath("//div[@id='J_goodsList']//li[@class='gl_item']")
            #for li in lis:
            #    try:
            #        price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
            #    except :
            #        price="0"
            #    try:
            #        src=li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
            #    except :
            #        src=""
            #try:
            #    self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next disabled']")
            #except :
            #    nextpage=self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next']")
            #    nextpage.click()
            
  • 相关阅读:
    如何实现序列化为json
    unity中camera摄像头控制详解
    eclipse配置c开发环境
    uml和模式01
    angular2开发01
    微信公众平台开发01
    最新无线网卡驱动安装
    交换ctrl和caps_lock的新方法
    web.xml文件详解
    设计模式中的里氏代换原则
  • 原文地址:https://www.cnblogs.com/jestin/p/12911256.html
Copyright © 2011-2022 走看看