zoukankan      html  css  js  c++  java
  • python推荐淘宝物美价廉商品 2.0

    改动:

    新增功能:可选择只看天猫或只看淘宝。

    代码模块化封装:参数配置和用户输入单独放在 setting.py 中管理,主程序 main.py 只从 setting 读取参数。

    main.py代码:

      1 # -*- coding: utf-8 -*-
      2 import urllib
      3 import urllib2
      4 import requests
      5 
      6 import re
      7 import time 
      8 import random
      9 import os
     10 from math import log
     11 from math import log10
     12 from math import sqrt
     13 import sys
     14 
     15 import setting
     16 
     17 
     18 #"pageSize":44,"totalPage":100,"currentPage":3,"totalCount":29561
     19 
     20 '''在Python自己IDE上要注释掉以下两行'''
     21 
     22 reload(sys)  
     23 sys.setdefaultencoding('utf8')  # python2.x的的defaultencoding是ascii
     24 
     class counter(object):
         """Shared crawl state: progress counters plus the products collected so far."""

         def __init__(self):
             # Progress counters reported by print_counter().
             self.count = 0
             self.try_time = 0
             self.try_find = 0
             self.fail_time = 0
             # Running sums of price and description score over accepted products.
             self.priSu = 0
             self.descSu = 0
             # Accepted product records and the detail URLs already seen.
             self.results = []
             self.url_list = []
             # Set to False by the page scraper when paging should stop.
             self.new_flag = True
             # Shop filter: '' for no filter, otherwise 'tmall' or 'taobao'.
             self.tm_tb = ''

         def print_counter(self):
             """Print the four progress counters on a single line."""
             pieces = (
                 'try_time:', self.try_time,
                 "  get_count:", self.count,
                 "  fail_time:", self.fail_time,
                 "try_find_time:", self.try_find,
             )
             # Joining with one space reproduces what a multi-item print emits.
             print(' '.join(str(piece) for piece in pieces))
     42 
     43 
     44  
     45 
     def recommend_rate(price, description, delivery, service, comments):
         """Return a recommendation score for one product; higher is better.

         The score multiplies three factors and adds a small bonus:
         the description score relative to the average (raised to the 20th
         power, so it dominates), the sum of the three shop scores, the
         average price relative to this price (raised to 0.1, so it is mild),
         plus log base 1000 of the comment count.

         Averages come from the module-level ``counter1`` sums, so at least one
         product must have been accepted (``counter1.count`` non-zero) and
         ``price`` must be non-zero, otherwise ZeroDivisionError is raised.
         """
         # float() keeps the averages fractional even if the sums are ints.
         accepted = float(counter1.count)
         av_p = counter1.priSu / accepted
         av_d = counter1.descSu / accepted
         # Parenthesised so the expression can span several lines.
         rate = ((description / av_d) ** 20
                 * (description + delivery + service)
                 * (av_p / price) ** 0.1
                 + log(comments + 5, 1000))
         return rate
     59 
     60 
     def product_rank(list):
         """Append a recommendation score to every product record, in place.

         Record layout: 0 name, 1 image URL, 2 product URL, 3 price,
         4 comment count, 5 shop name, 6 delivery score, 7 description score,
         8 service score. The score is appended as element 9.
         """
         for record in list:
             price, comments = record[3], record[4]
             delivery, description, service = record[6], record[7], record[8]
             score = recommend_rate(price, description, delivery, service, comments)
             record.append(score)
     66 
     67 
     68 def get_user_rate(item_url):
     69     #暂时未使用该功能
     70     '''获取卖家信用情况;未登录情况不能访问,或者需要在头部文件中加入cookie。。。;'''
     71     html = urllib2.urlopen(item_url)
     72     #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"
     73     regrex_rate = '"(//.*?user-rate.*?)"'
     74     codes =  re.findall(regrex_rate,html.read())
     75     html.close()
     76 
     77     user_rate_url= 'http:'+codes[0]
     78     print 'uu', user_rate_url
     79 
     80     user_rate_html = urllib2.urlopen(user_rate_url)
     81     print user_rate_html.read()
     82     #title = "4.78589分"
     83     desc_regex = u'title="(4.[0-9]{5}).*?'
     84     de_pat = re.compile(desc_regex)
     85     
     86     descs = re.findall(de_pat,user_rate_html.read())
     87     print len(descs)
     88     item_url = 'https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail'
     89 #get_user_rate(item_url)
     90 '''获取卖家信用情况;未登录情况不能访问。。。暂时 无用'''
     91 
     92 
     def makeNewdir(savePath):
         """Create a fresh results directory and return the path actually used.

         If savePath already exists, random digits (1-9) are appended until an
         unused name is found, so earlier results are never overwritten.
         On failure a message is printed and the (uncreated) path is still
         returned, as before.
         """
         while os.path.exists(savePath):
             savePath = savePath + str(random.randrange(1, 10))
         try:
             os.makedirs(savePath)
         except OSError:
             # Only directory-creation errors are reported as a failure.
             print("failed to make file path\nplease restart program")
             print(u'创建文件夹失败,请重新启动程序')
         else:
             print('ok,file_path we reserve results:  %s' % savePath)
             print(u'保存的路径为:')
         return savePath
    106         
    107 
    108 def get_praised_good(url, file_open, keyword, counts, descripHrequ, servHrequ, descripNrequ):
    109     #从给定的淘宝链接中 获取符合条件的商品list
    110     html = req_s.get(url)
    111     code = html.content
    112     html.close()
    113 
    114     regrex2 = ur'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":[(.*?),(.*?),(.*?)],"description":[(.*?),(.*?),(.*?)],"service":[(.*?),(.*?),(.*?)]' 
    115     #每一个匹配项 返回  15个 字符串 
    116     #x[0]开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况3个、x9描述相符情况3个、x12服务情况3个
    117     pat = re.compile(regrex2)
    118     meet_code = re.findall(regrex2, code)#
    119     if not len(meet_code):
    120         counter1.new_flag = False
    121         print 'no more new met products'
    122 
    123     for x in meet_code:
    124         # if counter1.count>=counts :
    125         #   print "have get enough pruducts"
    126         #   break
    127         counter1.try_find += 1
    128 
    129         description_higher = int(x[10])*float(x[11])/100
    130         service_higher = int(x[13])*float(x[14])/100
    131         try:
    132             x4 = int(x[4]) #description_count
    133         except:
    134             x4 = 0
    135 
    136         #如果 只要淘宝 非天猫
    137         if counter1.tm_tb == 'taobao':
    138             if counter1.tm_tb not in x[2].split('.'):
    139                 break
    140 
    141         if  (description_higher>=descripHrequ) and (service_higher>=servHrequ) and x4>=descripNrequ:
    142             if re.findall(keyword,x[0]) : # 中文keyword在结果中匹配问题暂时没有解决,,直接加在搜索词里吧 
    143                 x0 = x[0].replace(' ','').replace('/','')
    144                 detail_url = 'http:' + x[2].decode('unicode-escape').encode('utf-8')
    145                 x1 = 'http:'+ x[1].decode('unicode-escape').encode('utf-8')
    146                 #print type(x)
    147                 if detail_url in counter1.url_list  or x4 == 0:
    148                     counter1.new_flag = False
    149                     print 'no more new met products'
    150                     print counter1.url_list
    151                     print detail_url
    152                     break
    153                 counter1.url_list.append(detail_url)
    154                 counter1.try_time += 1
    155                 counter1.count += 1
    156 
    157                 x11 = float(x[11])/100
    158                 x9 = float(x[9])/100
    159                 x12 = float(x[12])/100
    160                 x6 = float(x[6])/100
    161                 x3 = float(x[3])
    162                 counter1.priSu += x3
    163                 counter1.descSu += x9
    164                 x5 = unicode(x[5],'utf-8')
    165                                 
    166                 result_list = []
    167                 result_list.append(x0)
    168                 result_list.append(x1)
    169                 result_list.append(detail_url)
    170                 result_list.append(x3)
    171                 result_list.append(x4)
    172                 result_list.append(x5)
    173                 result_list.append(x6)
    174                 result_list.append(x9)
    175                 result_list.append(x12)
    176                 #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况 
    177                 counter1.results.append(result_list)
    178             
    179 
    180 def save_downpic(lis,file_open,savePath):
    181     '''从商品list下载图片到reserve_file_path,并写入信息至fileopen'''
    182     #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况、x9:rate
    183     len_list = len(lis)
    184     print 'we find:',len_list,'products'
    185     cc = 0        
    186     for x in lis:
    187         try :
    188             urllib.urlretrieve(x[1], savePath+'\%s___'%cc +unicode(x[0],'utf-8')+'.jpg')
    189 
    190             txt_name = savePath+'\'+ '%s__'%cc+ 'custome_description_%s __'%x[7] +'__comments_%s_'%x[4]+ '___price_%srmb___'%x[3] +x[5] +'.txt'
    191                     
    192             file_o = open(txt_name, 'a')
    193             file_o.write(x[2])
    194             file_o.close()
    195             
    196             print '
    get_one_possible_fine_goods:
    ','good_name:',x[0].decode('utf-8')
    197             print 'rate=',x[9]
    198             print 'price:',x[3],x[5].decode('utf-8')
    199             print 'custome_description:',x[7],'--','described_number:',x[4],'  service:',x[8]
    200             print x[2].decode('utf-8'),'
    good_pic_url:',x[1].decode('utf-8')
    201 
    202             print txt_name
    203             print cc+1,"th"
    204 
    205             file_open.write(u'%s__'%cc 
    206                         + str(x[0]) 
    207                         + '
    price:' 
    208                         + str(x[3])  
    209                         + '¥,
    '    
    210                         + str(x[2]) + '  
    ' + str(x[5]) + '
    customer_description:' + str(x[7]) + 'described_number:' + str(x[4])+'
    
    
    ')
    211             
    212 
    213             print 'get one -^-'
    214         except :
    215             print "failed to down picture or creat txt"
    216             counter1.fail_time += 1
    217         cc += 1
    218         time.sleep(0.5)
    219 
    220 
    221         
    def get_market_totalCount(url):
        """Return the total number of search hits reported by the page at url.

        The number is read from the embedded paging data, e.g.
        "pageSize":44,"totalPage":100,"currentPage":3,"totalCount":29561}

        Raises ValueError when the page does not contain that paging data.
        """
        html = urllib2.urlopen(url)
        try:
            code = html.read()
        finally:
            # Release the connection even if reading fails.
            html.close()

        reg = '"pageSize":[0-9]*?,"totalPage":[0-9]*?,"currentPage":[0-9]*?,"totalCount":([0-9]*?)}'
        found = re.search(reg, code)
        if found is None:
            raise ValueError('totalCount not found in search page: %s' % url)
        return int(found.group(1))
    229     
    230 #"pageSize":44,"totalPage":100,"currentPage":3,"totalCount":29561
    231 
    232 
    233 def get_all_praised_goods(serchProd,counts,savePath ,keyword, price_min=0,price_max=0,descripHrequ =0,servHrequ=0 ,descripNrequ=0):
    234     #边里搜索结果每一页
    235     #initial url and page number
    236     initial_url = 'https://s.taobao.com/search?q='+serchProd + '&_input_charset=utf-8'
    237 
    238     if counter1.tm_tb == 'tmall':
    239         initial_url = initial_url + '&filter_tianmao=tmall'
    240 
    241     if  price_min:
    242         if price_min < price_max :
    243             initial_url = initial_url+'&filter=reserve_price%5B'+'%s'%price_min+'%2C' +'%s'%price_max
    244     initial_url = initial_url +'&cd=false&%5D&s='
    245 
    246     #tian_mall = 'https://list.tmall.com/search_product.htm?q='
    247 
    248     print "initial_url",initial_url+'0'
    249 
    250     page_n = 0
    251     reserve_file = savePath+r'found_goods.txt'
    252     file_open = open(reserve_file,'a')
    253 
    254     file_open.write('****************************
    ')
    255     file_open.write(time.ctime())
    256     file_open.write('
    ****************************
    ')
    257 
    258     total = get_market_totalCount(initial_url+'0')
    259     print "totalcount",total
    260     if total>counts*10:
    261         total = sqrt(total)
    262 
    263     while counter1.new_flag and counter1.try_find<total :
    264         
    265         url_1 = initial_url + str(44*page_n)
    266         #print initial_url
    267         print 'url_1:', url_1
    268         #print 'ss',initial_url+'%s'%(44*page_n)
    269         page_n += 1
    270 
    271         get_praised_good(url_1,file_open,keyword,counts,descripHrequ,servHrequ ,descripNrequ)
    272         print "let web network rest for 1s lest  make traffic jams "
    273         time.sleep(1)
    274         # except:
    275         print page_n, "pages have been searched"            
    276         if total < counts :
    277             print "check keyword,maybe too restrict"
    278             break
    279 
    280     print url_1     
    281     product_rank(counter1.results)
    282 
    283     counter1.results.sort(key = lambda x : x[9], reverse=True)      
    284     counter1.results = counter1.results[:counts]
    285     
    286     counter1.print_counter()
    287     
    288     save_downpic(counter1.results,file_open,savePath)
    289     
    290     #
    291     for a in  counter1.results:
    292         for b in a :
    293             file_open.write(unicode(str(b),'utf-8'))
    294             file_open.write('	')
    295         file_open.write('
    
    ')
    296     
    297     file_open.close()
    298     counter1.print_counter()
    299 
    300 
    # Crawl state shared by every function in this module.
    counter1 = counter()

    market_totalcounts = 0


    # One HTTP session for all search-page requests.
    # Retries are configured on a mounted transport adapter; assigning
    # DEFAULT_RETRIES on the session's adapter mapping configured nothing.
    req_s = requests.Session()
    _retry_adapter = requests.adapters.HTTPAdapter(max_retries=3)
    req_s.mount('http://', _retry_adapter)
    req_s.mount('https://', _retry_adapter)
    # Kept for compatibility; nothing in this file reads this attribute.
    req_s.keep_alive = True
    309 
    310 
    311 def main():
    312     print "说明:".decode('utf-8') 
    313     print '本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;
    筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接'.decode('utf-8')  
    314     
    315     if setting.userDefine:      #自己输入 配置参数-筛选要求
    316         setting.inputPara() 
    317                     #否则  使用setting中的配置参数
    318 
    319     serchProd   = setting.serchProd         #淘宝搜索词
    320     keyword     = setting.keyword               #raw_input().decode("gbk").encode("utf-8")       #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
    321     price_min   = setting.price_min         #价格区间
    322     price_max   = setting.price_max
    323     descripHrequ = setting.descripHrequ    # %   默认高于average, 输出结果大于此值
    324     servHrequ    = setting.servHrequ        # %  默认高于average, 输出结果大于此值
    325     descripNrequ = setting.descripNrequ
    326     counts       = setting.counts               #要求选出多少个商品
    327     counter1.tm_tb = setting.tm_tb          #不区分天猫淘宝则,字符串为空,,只要天猫 则 ='tmall' ,只要淘宝 = 'taobao'
    328 
    329     #savePath = r"C:UsersAdministratorDesktopPython scrapyfind_worthy_goods
    esults"#结果保存路径
    330     savePath = u"results%s"%serchProd #结果保存路径
    331     savePath = makeNewdir(savePath)
    332     
    333     get_all_praised_goods(serchProd, counts, savePath, keyword, price_min, price_max ,descripHrequ ,servHrequ ,descripNrequ)
    334 
    335 
    336 if __name__ == "__main__" :
    337     main()
    338 
    339     
    340     #保存图片,以文件名为商品图片名字,并以序号开头
    341     #同时,输出 价格、商家名,商品描述、服务等 到 txt文本
    342     #在商品图片看中后,便可按序号查找 
    343     #按描述、服务评价高于平均,购物体验应该可以的
    View Code

    setting.py

    # -*- coding: utf-8 -*-
    
    # When True, main() calls inputPara() to ask for the settings interactively;
    # when False, the values below are used as they are.
    userDefine = False
    # Filter settings
    
    serchProd='背包'     # Taobao search term
    keyword=''                 # regular expression the product title must match, to drop loosely related items Taobao mixes in; leave empty for no restriction
    price_min=22            # price range
    price_max=100
    descripHrequ=0       # percent; the "description above average" value must be at least this
    servHrequ=0          # percent; the "service above average" value must be at least this
    # Minimum number of comments a product needs.
    descripNrequ=6
    counts=25            # how many products to select
    tm_tb ='tmall'       # '' = no shop filter, 'tmall' = Tmall only, 'taobao' = Taobao only
    
    
    def inputPara():
        ''' 用户选择是否自定义要求,根据要求进行获取商品,并按推荐排序输出'''
        print "please input reserch _goods_name"
        global serchProd , keyword , price_min, price_max, descripHrequ , servHrequ,  descripNrequ ,counts ,tm_tb
    
        serchProd=raw_input().replace(' ','')    #淘宝搜索词 ,并去除中间意外输入的空格
    
        if serchProd:
            
            print "if customise price_range ,decriptiom require .etc.
    input Y/N 
     default by : no price limit avarage than descriptiom,get 50 products 
     默认要求为:无价格限制,商品描述、快递、服务高于均值,获取50个商品。自定义要求请输入 ‘Y’ (区分大小写)".decode('utf-8')
            if raw_input() == 'Y':
                print "
    please input  _minimal price and _maximal price;   
    default by 0,10000
    next by 'enter'key input nothing means by default,the same below "
                print '请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 '.decode('utf-8')
                try:
                    price_min, price_max=input()
                except:
                    print 'not input or wrong number,use default range'
                    price_min, price_max = 0 ,10000
                
                #
                print '是否要求 只看天猫/正品保障  还是只看淘宝 
     只看天猫输入 tmall ,只看淘宝输入taobao,都看则回车略过'
                try:
                    tm_tb=raw_input().decode("gbk").encode("utf-8")      #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
                except:
                    tm_tb=''
                #
                    # #
                print "please input _keyword that goods name must include:
    (more than one keyword must use Regular Expression); default by no kewords"
                try:
                    keyword=raw_input().decode("gbk").encode("utf-8")      #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
                except:
                    keyword=''
                #    
    
                print "
    please input  _description_higher_percent_require and _service_higher__percent_require
     range:(-100,100) ;   
    default by 0,0  I.e better than average"
                print '请输入商品描述、服务高于平均值的百分比-100 ~100'.decode('utf-8')
                     # %   默认高于average, 输出结果大于此值
                try:
                    descripHrequ,servHrequ=input()              
                except:
                    print 'not input or wrong number,use default range'
                    descripHrequ = 0  # %  默认高于average, 输出结果大于此值
                    servHrequ = 0
                
                #    
                print "
    please input description count limit,  default more than 5
    " ,'输入最低商品评价数,默认大于5'.decode('utf-8')
                try:
                    descripNrequ=input()
                except :
                    print 'not input or wrong number,use default range'
                    descripNrequ=5
                #
                    
                    # print "
    IF customise file reserve path, Y or N  
    default/sample as:  C:\Users\Administrator\Desktop\find_worthy_goods\results "
                    # print '是否自定义保存文件目录 Y or N'.decode('utf-8')
                    # if raw_input()=='Y':
                    #     print "please input path that you want to reserve;  
     "    
                    #     savePath = raw_input()
                    # else:
                    #     #savePath=r"C:UsersAdministratorDesktopfind_worthy_goods
    esults"#结果保存路径        
                #
                print "
    please input how many results you want,  default by 50
    " ,'您要获取的商品数目,默认50'.decode('utf-8')
                try:
                    counts=input()
                except :
                    counts=50
            else :
                counts =50
                keyword = ''
                tm_tb = ''
                price_min ,price_max ,descripHrequ ,servHrequ ,descripNrequ = 0,0,0,0,0
        else:
            print "no search goods,please restart"
            print '没有输入商品名称,请重新启动程序'.decode('utf-8')
    View Code
  • 相关阅读:
    python记录程序运行时间的三种方法
    LeetCode 922. 按奇偶排序数组 II 做题小结
    LeetCode 976. 三角形的最大周长 做题小结
    LeetCode 1122. 数组的相对排序 做题小结
    LeetCode1528. 重新排列字符串 做题小结
    LeetCode 738. 单调递增的数字 做题小结
    selenium——鼠标操作ActionChains:点击、滑动、拖动
    剑指 Offer 32
    剑指 Offer 32
    二叉树的应用:二叉排序树的删除
  • 原文地址:https://www.cnblogs.com/willowj/p/6266507.html
Copyright © 2011-2022 走看看