zoukankan      html  css  js  c++  java
  • 简单爬虫

    from pyquery import PyQuery as pq
    import urllib.request
    import pymysql
    import uuid
    
    # Open the MySQL connection and cursor that the rest of the script shares.
    # NOTE(review): the credentials are hard-coded here; consider loading them
    # from configuration or the environment instead of source code.
    conn = pymysql.connect(host='127.0.0.1', user="root", passwd="123456", db="test", port=3306, charset="utf8")
    cur = conn.cursor()
    cur.execute("select * from user")
    # Fetch every row of the result set and print each one.
    users = cur.fetchall()
    for user in users:
        print(user)
    
    # Fetch the raw HTML source of one listing page.
    def get_content(page):
        """Download listing page number *page* and return its HTML as text.

        Args:
            page: Page number appended to the listing URL's ``page=`` query
                parameter (converted with ``str``).

        Returns:
            The response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        url ='https://saudi.souq.com/sa-en/mobile-phone-accessories/l/?rpp=32&_=1550499488459&sortby=sr&section=2&page='+ str(page)
        # The context manager closes the HTTP response even when reading or
        # decoding raises, so no connection leaks across thousands of pages.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    
    def get(html):
        """Parse *html* with PyQuery and return the elements matching the
        product-link CSS selector."""
        link_selector = '.img-link.quickViewAction.sPrimaryLink'
        return pq(html)(link_selector)
    
    # Crawl the listing pages, store each product in the `shop` table and append
    # every visited product link to job.txt.
    # The statement text never changes, so it is built once outside the loops.
    sql = "insert into shop (product_id, product_name,product_link,product_seller,product_price,product_sales,product_stock,product_image) values (%s, %s, %s, %s,%s, %s, %s, %s)"
    # try/finally guarantees the cursor and connection are released even when a
    # listing-page request raises and aborts the crawl.
    try:
        for j in range(1, 3000):
            print("正在爬取第"+str(j)+"页数据...")
            html = get_content(j)  # raw HTML of listing page j
            for i in get(html):
                prodouct_link = pq(i).attr('href')
                if not prodouct_link:
                    # An anchor without an href has nothing to fetch or record;
                    # previously this crashed on `None + str` when writing the log.
                    continue
                # A single unresponsive or malformed product page must not stop
                # the whole crawl, so failures are reported and skipped.
                try:
                    doc = pq(url=prodouct_link)
                    title = doc('.product-title>h1').text()
                    price = doc('.price.is.sk-clr1').text()
                    stock = doc('.txtcolor-alert.xleft>span').text()
                    shop_name = doc('.unit-seller-link>a>b').text()
                    sales = doc('.show-for-medium.bold-text').text()
                    image = doc('.img-bucket>img').attr("src")
                    prodouct_id = str(uuid.uuid1())

                    try:
                        count = cur.execute(sql, [prodouct_id, title, prodouct_link, shop_name, price, sales, stock, image])
                        # Report whether a row was actually inserted.
                        if count > 0:
                            print("添加数据成功!\n")
                        # Commit the transaction.
                        conn.commit()
                    except pymysql.MySQLError as db_error:
                        # Undo the failed statement so the connection stays usable.
                        conn.rollback()
                        print("insert failed for " + prodouct_link + ": " + str(db_error))
                except Exception as page_error:
                    print("skipping " + prodouct_link + ": " + str(page_error))
                # Log the link whether or not the product was stored; the
                # `with` block closes the file, so no explicit close() is needed.
                with open('job.txt', 'a', encoding='utf-8') as f:
                    f.write(prodouct_link+'\n')
    finally:
        # Release the database resources.
        cur.close()
        conn.close()
  • 相关阅读:
    Jquery
    JavaScript
    poj--2115 C Looooops
    poj--3970 party
    poj 1061 青蛙的约会
    hdu1250--Hat's Fibonacci
    2318--TOYS
    扩展欧几里得--让你一次刷个够
    关于大数加法的解法
    有关环形数组的约瑟夫问题
  • 原文地址:https://www.cnblogs.com/xmyfsj/p/15231931.html
Copyright © 2011-2022 走看看