zoukankan      html  css  js  c++  java
  • python商品分类信息

      采集商品分类信息

    from selenium.webdriver.common.action_chains import ActionChains
    from pyquery import PyQuery as pq
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    # 抓取分类数据
    def tianmao_catch_category():
        """Scrape the Tmall home-page category menu and store every category path.

        Opens the Tmall home page in a browser obtained from ``get_driver``,
        hovers over each entry of the top-level navigation list to collect its
        title, then parses the page HTML with PyQuery and walks the category
        panels.  Every (level-one, level-two, level-three) combination found is
        printed and passed to ``db.saveCategory`` under the site name '天猫'.

        Returns:
            None.  Exceptions raised while scraping are caught and printed; the
            browser is always closed before returning.
        """
        # Local import: ``By`` is not among the file-level imports.  The generic
        # ``find_elements(By.XPATH, ...)`` replaces ``find_elements_by_xpath``,
        # which was removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        # Arguments are passed through unchanged; their meaning is defined by
        # get_driver, which lives outside this block.
        driver = get_driver('', False)
        try:
            url = 'https://www.tmall.com/?ali_trackid=2:mm_26632258_3504122_55934697:1609295236_235_1586302010&union_lens=recoveryid:1609295236_235_1586302010&clk1=3a059b6fd5d21a5e9086e711fdf3afe4&bxsign=tbkJxFfRkMJdwE3OwpP483v2+4G1PrzCDIDumBW7tv5QzQfc+xlm3i2oiRMn2bJl4qaPrxH6ekD1p3hgS1sBUJbM4REq9LyuFhLBITi5yXSBSs='
            driver.get(url)

            # Fixed pause after navigation before querying the DOM.
            time.sleep(10)
            nav_items = driver.find_elements(
                By.XPATH, "//ul[@class='normal-nav clearfix']/li")
            top_titles = []

            for nav_item in nav_items:
                # Mouse-over each menu entry; presumably this makes the site
                # render the matching flyout panel before the HTML is captured.
                ActionChains(driver).move_to_element(nav_item).perform()
                # NOTE(review): the original chain ended with a
                # ``.replace('', '')`` no-op whose argument was probably a
                # non-printable character lost in copying; restore it if known.
                title = str(nav_item.text).replace(' /', '/').strip()
                # Drop a leading icon token (characters like "\ue615") when the
                # text has more than one space-separated part.
                tokens = title.split(' ')
                if len(tokens) == 1:
                    top_titles.append(tokens[0])
                else:
                    # Always append exactly one title per menu entry so titles
                    # stay aligned with the panels paired with them below.
                    top_titles.append(' '.join(tokens[1:]).strip())
                time.sleep(3)

            page_html = driver.execute_script("return document.documentElement.outerHTML")
            doc = pq(page_html)
            panels = list(doc("div[class='content-con j_categoryContent']").find(
                "div[class='pannel-con j_CategoryMenuPannel']").find("div[class^='pannel-']").items())
            print('\n')
            netname = '天猫'

            if len(panels) != len(top_titles):
                # Pairing is positional, so a count mismatch means some rows
                # may be mislabelled or skipped; make that visible.
                print('warning: %d menu titles but %d category panels'
                      % (len(top_titles), len(panels)))

            # zip() stops at the shorter sequence instead of raising IndexError.
            for category_one, panel in zip(top_titles, panels):
                hot_lines = panel.find("div[class='hot-word-con']").find("div[class='hot-word-line']")
                for hot_line in hot_lines.items():
                    category_two = hot_line.find("div[class='line-title']").find("div[class='title-text']").text()
                    links = hot_line.find("div[class='line-con']").find("a[class^='hot-word']")
                    for link in links.items():
                        category_three = link.text()
                        print(category_one, category_two, category_three)
                        db.saveCategory(netname, category_one, category_two, category_three)
                print('\n')
        except Exception as ex:
            # Best-effort scraper: report the failure and fall through to cleanup.
            print(ex)
        finally:
            # Close the browser no matter how the scrape ended.
            driver.quit()
    
    # 抓取分类数据
    def jingdong_catch_category():
        """Scrape the JD.com home-page category menu and store every category path.

        Opens the JD home page in a browser obtained from ``get_driver``, hovers
        over each top-level menu item to collect its title, then parses the page
        HTML with PyQuery and walks the category panels under ``#J_popCtn``.
        Every (level-one, level-two, level-three) combination found is printed
        and passed to ``db.saveCategory`` under the site name '京东'.  The number
        of panels processed is printed at the end.

        Returns:
            None.  Exceptions raised while scraping are caught and printed; the
            browser is always closed before returning.
        """
        # Local import: ``By`` is not among the file-level imports.  The generic
        # ``find_elements(By.XPATH, ...)`` replaces ``find_elements_by_xpath``,
        # which was removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        # Arguments are passed through unchanged; their meaning is defined by
        # get_driver, which lives outside this block.
        driver = get_driver('', False)
        try:
            url = 'https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_48ba7a220ee5462c97fc2d5f3691e5c5'
            driver.get(url)
            # Fixed pause after navigation before querying the DOM.
            time.sleep(10)
            menu_items = driver.find_elements(
                By.XPATH, "//ul[@class='JS_navCtn cate_menu']/li[@class='cate_menu_item']")

            top_titles = []

            for menu_item in menu_items:
                # Mouse-over each menu entry; presumably this makes the site
                # render the matching flyout panel before the HTML is captured.
                ActionChains(driver).move_to_element(menu_item).perform()
                # NOTE(review): the original chain ended with a
                # ``.replace('', '')`` no-op whose argument was probably a
                # non-printable character lost in copying; restore it if known.
                data_title = str(menu_item.text).replace('/ ', '/').replace(' /', '/').strip()

                print('data_title=', data_title)

                top_titles.append(data_title)
                time.sleep(3)

            page_html = driver.execute_script("return document.documentElement.outerHTML")
            doc = pq(page_html)
            panels = list(doc("div[id='J_popCtn']").find("div[class='cate_part clearfix']").items())
            print('\n')
            netname = '京东'

            if len(panels) != len(top_titles):
                # Pairing is positional, so a count mismatch means some rows
                # may be mislabelled or skipped; make that visible.
                print('warning: %d menu titles but %d category panels'
                      % (len(top_titles), len(panels)))

            processed = 0
            # zip() stops at the shorter sequence instead of raising IndexError.
            for category_one, panel in zip(top_titles, panels):
                first_column = panel.find("div[class='cate_part_col1']")

                # Channel links: the first link's text is used as the level-two
                # name, every following link is saved as a level-three entry.
                channel_links = first_column.find("div[class='cate_channel']").find("a[class='cate_channel_lk']")
                channel_names = [str(link.text()) for link in channel_links.items()]
                if channel_names:
                    category_two = channel_names[0]
                    for category_three in channel_names[1:]:
                        print(category_one, category_two, category_three)
                        db.saveCategory(netname, category_one, category_two, category_three)

                # Detail rows: the first <dl> only supplies the level-two title;
                # links of the remaining <dl> rows are saved under that title.
                # NOTE(review): if each <dl> carries its own <dt> title, this
                # skips the first row's links and mislabels the others - confirm
                # against the live page markup before changing.
                detail_rows = first_column.find("div[class='cate_detail']").find(
                    "dl[class^='cate_detail_item cate_detail_item']")
                category_two = ''
                for position, detail_row in enumerate(detail_rows.items()):
                    if position == 0:
                        category_two = str(
                            detail_row.find("dt[class='cate_detail_tit']").find("a[class='cate_detail_tit_lk']").text())
                        continue
                    detail_links = detail_row.find("dd[class='cate_detail_con']").find("a[class='cate_detail_con_lk']")
                    for detail_link in detail_links.items():
                        category_three = str(detail_link.text())
                        print(category_one, category_two, category_three)
                        db.saveCategory(netname, category_one, category_two, category_three)

                processed += 1
                print('\n')
            print(processed)
        except Exception as ex:
            # Best-effort scraper: report the failure and fall through to cleanup.
            print(ex)
        finally:
            # Close the browser no matter how the scrape ended.
            driver.quit()

      

  • 相关阅读:
    Linux-Deepin下搭建云笔记
    MySQL-脏页的刷新机制
    网络编程-Netty-writeAndFlush方法原理分析 以及 close以后是否还能写入数据?
    MySQL-SQL调优-引擎选错索引或者不使用索引分析 和 字符串加索引的方式思考
    字体的一些属性
    css清除浮动大全,共8种方法
    WEB前端开发人员须知的常见浏览器兼容问题及解决技巧
    区别各种IE浏览器和火狐的css写法
    IE和火狐浏览器对css解析的不一致
    使用photowap插件
  • 原文地址:https://www.cnblogs.com/shaosks/p/14214849.html
Copyright © 2011-2022 走看看