# zoukankan html css js c++ java
# 爬取京东数据

#模仿浏览器
#下载地址
#创建数据库
#初始化downloadSpider文件夹
#访问京东页面
#数据库增删改查
import datetime
import os
import sqlite3
import threading
import time
import urllib
import urllib.parse
import urllib.request

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
class MySipider:
    """Scrape product listings with headless Chrome, store them in SQLite and
    download each product image on a worker thread.

    Typical use is ``ExecuteSpider(url, key)`` followed by ``ShowDB()``.

    Class attributes:
        herder: HTTP headers sent with every image download.
        imagePath: Directory the downloaded images are written to.
    """

    # Pretend to be a browser when fetching images.
    herder = {
        "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00"
    }
    # Download directory.
    imagePath = "downloadSpider"

    def StartUp(self, url, key):
        """Start the browser, reset the database and image folder, and search.

        Args:
            url: Page to open; it must contain a search box with id ``key``.
            key: Keyword typed into the search box.

        Raises:
            sqlite3.Error: If ``phones.db`` cannot be opened or reset.
            selenium.common.exceptions.WebDriverException: If the browser
                cannot be started or the search box is missing.
        """
        # Per-run state first, so cleanup code can rely on it even when a
        # later step in this method fails.
        self.thread = []
        self.No = 0
        self.imgNo = 0

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)
        self.driver.maximize_window()

        # Recreate the table so every run starts from an empty result set.
        self.con = sqlite3.connect("phones.db")
        self.cursor = self.con.cursor()
        self.cursor.execute("drop table if exists phones")
        sql = "create table phones(mNO varchar(32) primary key,mMark varchar(126),mNote varchar(126),mPrice varchar(126),mFile varchar(126))"
        self.cursor.execute(sql)

        # Make sure the download folder exists and is empty.
        try:
            if not os.path.exists(MySipider.imagePath):
                os.mkdir(MySipider.imagePath)
            for img in os.listdir(MySipider.imagePath):
                os.remove(os.path.join(MySipider.imagePath, img))
        except OSError as err:
            print(err)

        # Open the first page and submit the search.
        self.driver.get(url)
        keyInput = self.driver.find_element(By.ID, 'key')
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def CloseUp(self):
        """Commit and close the database, then shut the browser down.

        The two resources are released independently so that a database
        failure cannot leave the browser process running.
        """
        try:
            self.con.commit()
            self.con.close()
        except (AttributeError, sqlite3.Error) as err:
            # AttributeError: StartUp failed before the connection existed.
            print(err)
        try:
            # quit() ends the whole driver session; close() would only close
            # the current window.
            self.driver.quit()
        except (AttributeError, WebDriverException) as err:
            print(err)

    def InsertDB(self, mNO, mMark, mPrice, mNote, mFile):
        """Insert one product row into the ``phones`` table.

        Args:
            mNO: Primary key of the row.
            mMark: Value for the ``mMark`` column.
            mPrice: Value for the ``mPrice`` column.
            mNote: Value for the ``mNote`` column.
            mFile: Image file name, or ``""`` when there is no image.

        Database errors (for example a duplicate key) are printed, not raised.
        """
        try:
            sql = "insert into phones(mNO,mMark,mPrice,mNote,mFile) values(?,?,?,?,?)"
            self.cursor.execute(sql, (mNO, mMark, mPrice, mNote, mFile))
        except sqlite3.Error as err:
            print(err)

    def ShowDB(self):
        """Print number, mark, price and image file of every stored row."""
        try:
            con = sqlite3.connect("phones.db")
            try:
                cursor = con.cursor()
                cursor.execute("select mNO,mMark,mPrice,mFile from phones order by mNO")
                for row in cursor.fetchall():
                    print(row[0], row[1], row[2], row[3])
            finally:
                con.close()
        except sqlite3.Error as err:
            print(err)

    def _fetch(self, src):
        """Return the bytes served at ``src``, or ``None`` if the request fails."""
        try:
            req = urllib.request.Request(src, headers=MySipider.herder)
            with urllib.request.urlopen(req, timeout=400) as resp:
                return resp.read()
        except Exception as err:
            # This runs on a worker thread: report the failure and let the
            # caller try the fallback URL instead of killing the thread.
            print(err)
            return None

    def downloadSpider(self, src1, src2, mFile):
        """Download an image and save it as ``mFile`` inside ``imagePath``.

        Args:
            src1: Preferred image URL; may be empty.
            src2: Fallback URL, used only when ``src1`` yields no data.
            mFile: File name to write inside ``imagePath``.
        """
        data = self._fetch(src1) if src1 else None
        if not data and src2:
            data = self._fetch(src2)
        if data:
            with open(os.path.join(MySipider.imagePath, mFile), "wb") as fobj:
                fobj.write(data)
            print("downlaod", mFile)

    @staticmethod
    def _read(li, xpath, attribute=None, default=""):
        """Return the text (or ``attribute``) of the first match of ``xpath``
        under ``li``; return ``default`` when it is missing or empty."""
        try:
            element = li.find_element(By.XPATH, xpath)
            if attribute is None:
                value = element.text
            else:
                value = element.get_attribute(attribute)
        except WebDriverException:
            return default
        return value if value else default

    def _process_item(self, li, page_url):
        """Store one listing element and start the download of its image."""
        img_xpath = ".//div[@class='p-img']//a//img"
        src1 = self._read(li, img_xpath, "src")
        # NOTE(review): assumes lazily loaded images keep their URL in
        # "data-lazy-img" - confirm against the live page.
        src2 = self._read(li, img_xpath, "data-lazy-img")
        price = self._read(li, ".//div[@class='p-price']//i", default="0")
        # NOTE(review): assumes the product name sits in a "p-name" div and
        # that its first word is the brand - confirm against the live page.
        note = self._read(li, ".//div[contains(@class,'p-name')]//em")
        words = note.split()
        mark = words[0] if words else ""

        # Zero-padded running number: usable as primary key and file name,
        # and it sorts correctly as text in ShowDB.
        self.No += 1
        no = str(self.No).zfill(6)

        if src1:
            src1 = urllib.parse.urljoin(page_url, src1)
        if src2:
            src2 = urllib.parse.urljoin(page_url, src2)

        primary = src1 or src2
        if primary:
            # Take the extension from the URL path so a query string
            # containing dots cannot corrupt the file name.
            ext = os.path.splitext(urllib.parse.urlparse(primary).path)[1]
            mFile = no + ext
            T = threading.Thread(
                target=self.downloadSpider,
                args=(src1, src2, mFile),
                daemon=False,
            )
            T.start()
            self.thread.append(T)
        else:
            mFile = ""
        self.InsertDB(no, mark, price, note, mFile)

    def _go_to_next_page(self):
        """Click the "next page" link; return ``False`` on the last page."""
        pager = "//span[@class='p-num']"
        if self.driver.find_elements(By.XPATH, pager + "//a[@class='pn-next disabled']"):
            return False
        links = self.driver.find_elements(By.XPATH, pager + "//a[@class='pn-next']")
        if not links:
            return False
        links[0].click()
        return True

    def ProcessSpider(self):
        """Process every result page, following the pager until it ends.

        Browser errors end the crawl and are printed instead of raised.
        """
        try:
            while True:
                # Give the page a moment to render its results.
                time.sleep(1)
                page_url = self.driver.current_url
                print(page_url)
                # NOTE(review): the item class was written 'gl_item'; JD
                # appears to use 'gl-item' - confirm against the live page.
                lis = self.driver.find_elements(
                    By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']"
                )
                for li in lis:
                    self._process_item(li, page_url)
                if not self._go_to_next_page():
                    break
        except WebDriverException as err:
            print(err)

    def ExecuteSpider(self, url, key):
        """Run a full crawl: start up, process all pages, clean up.

        Args:
            url: Start page passed to ``StartUp``.
            key: Search keyword passed to ``StartUp``.

        Cleanup runs even when start-up or crawling raises, so the browser
        and database are always released and the download threads joined.
        """
        try:
            self.StartUp(url, key)
            self.ProcessSpider()
        finally:
            self.CloseUp()
            for t in self.thread:
                t.join()
# Entry point: a small text menu that drives the spider until the user quits.
url = "http://www.jd.com"
spider = MySipider()
s = ""
while s != "3":
    # Show the menu (1 = crawl, 2 = show stored rows, 3 = quit).
    for option in ("1,爬取", "2,显示", "3,退出"):
        print(option)
    s = input("请输入选择")
    if s == "1":
        spider.ExecuteSpider(url, "手机")
    elif s == "2":
        spider.ShowDB()
    # Any other input just shows the menu again.


        #lis = self.driver.find_element_by_xpath("//div[@id='J_goodsList']//li[@class='gl_item']")
        #for li in lis:
        #    try:
        #        price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
        #    except :
        #        price="0"
        #    try:
        #        src=li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
        #    except :
        #        src=""
        #try:
        #    self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next disabled']")
        #except :
        #    nextpage=self.driver.find_element_by_xpath("//pan[@class='p-num']//a[@class='pn-next']")
        #    nextpage.click()
# 查看全文
# 相关阅读:
# 捡到一本<C++ Reference>
# 题目1008：最短路径问题
# 题目1014：排名
# 题目1080：进制转换
# 题目1081：递推数列
# 题目1086：最小花费
# 题目1076：N的阶乘
# 题目1035：找出直系亲属
# 在Mac上搭建Jenkins环境
# 获取鼠标点击UGUI，先对于特定物体的相对坐标
# 原文地址：https://www.cnblogs.com/jestin/p/12911256.html