zoukankan      html  css  js  c++  java
  • 简易天猫爬虫

    天猫商品数据爬取代码分享

    虽然很简陋但是写这个程序我学到了一些新的技术,比如openpyxl库的使用,python的打包啊,设置图标啥的,还是收获很多.

    闲话不多说,直接上代码

    # Tmall product-listing scraper: prompts for a keyword and page count,
    # scrapes name / price / origin / monthly sales / link per product,
    # and writes the results to an .xlsx workbook.
    import re
    import urllib.parse
    import requests
    from openpyxl import Workbook
    from openpyxl.styles import Font,Alignment
    import os

    # Create the workbook and use its default active worksheet.
    wb = Workbook()
    ws = wb.active

    # Header row (column titles).
    ws['A1'] = '商品名称'
    ws['B1'] = '商品价格'
    ws['C1'] = '产地'
    ws['D1'] = '月成交量'
    ws['E1'] = '商品链接'

    # Prompt the user for the search keyword, page count and output file name.
    print('----欢迎使用----')
    keyword = input("请输入你要查找的商品名称:")
    frequency = int(input("请输入你要下载的页数(1~100):"))
    name = input("请输入你要保存的文件名:")+'.xlsx'

    # Running maxima for the auto-sized columns A (name) and E (link).
    # BUG FIX: the original compared against a constant 0, so the widths
    # were overwritten by every row instead of only ever widening.
    width_a = 0
    width_e = 0

    # Keep the header row visible while scrolling.
    ws.freeze_panes = 'A2'

    # Center the header cells and render them bold at size 20.
    for col in ('A1', 'B1', 'C1', 'D1', 'E1'):
        ws[col].alignment = Alignment(horizontal='center', vertical='center')
        ws[col].font = Font(size=20, bold=True)

    # Fixed widths for the price, origin and sales-volume columns.
    ws.column_dimensions['B'].width = 20
    ws.column_dimensions['C'].width = 20
    ws.column_dimensions['D'].width = 20

    # Percent-encode the (possibly Chinese) keyword for use in the URL.
    keyword = urllib.parse.quote(keyword)

    # Request headers.
    # BUG FIX: the key must be 'User-Agent' — the original 'user_agent'
    # is not a real HTTP header name and was silently ignored by servers.
    # NOTE(review): the hard-coded cookie expires; the script tells the
    # user to refresh it when scraping fails.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36',
            'cookie':'cna=dzhnFJcvPFYCAcplZsR6KtPL; hng=CN%7Czh-CN%7CCNY%7C156; lid=su%E3%80%81%E9%9F%A9; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; tk_trace=1; t=cf98821548a85be3261b5d3e02dfc50c; _tb_token_=53317e736e697; cookie2=171ec1fa788a97042140b1dc23ea8cbd; _m_h5_tk=ce5a4b969c5fee0ce43fbecb1c8b5698_1544366646609; _m_h5_tk_enc=08f79f8dd140168b3e95d192521140f3; x=__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=UIHiLt3xTIkz&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTYMh2PRIzYCw%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByR1fSHC9rqvO0Hw%3D&id2=UUGk2VnYR7N9og%3D%3D&nk2=saDewT4jhA05glO9&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; _l_g_=Ug%3D%3D; ck1=""; unb=2967018108; lgc=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; cookie1=AiVdFlFBrPvLkxJuDQ%2FIWWWqMYV30iZYcqUsqvmxAjc%3D; login=true; cookie17=UUGk2VnYR7N9og%3D%3D; _nk_=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; uss=""; csg=da3a7af3; skt=22f8e8af802abead; enc=P0JAHDOULky9KTinsCWQ4Ib6YVG7q7qPW5KKCJd4YWKlwiYOGGRObgbMOWOpxn4w12VNH34hJK%2FVCxsPmDqs%2FQ%3D%3D; pnm_cku822=098%23E1hvc9vUvbpvUvCkvvvvvjiPR2FWljlUn2qw6jEUPmPZ1jrERFdO1jYUnLS9zjtUiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMvphvC9mvphvvvvyCvhQv7sg%2FjNpBKBh78BoxfXkXdiYso%2BLpjXe4Vc3Z0f06W3vOJ1kHsfUpeB6AxYjxRLwprj6OfwoKjd8rJm7g%2BfUz%2BsIIHYFpeiQa5javuphvmvvvpoX8LTuKkphvC9hvpyPw1byCvm9vvhCvvvvvvvvvBfIvvvjivvCVB9vv9LvvvhXVvvmCjvvvByOvvUhw; cq=ccp%3D0; swfstore=199766; isg=BG5uugvevlR2SM3ay3guf99Uv8ScezhtaUhC2pg36XEsew_VAP0keVBxNqcy_yqB'}

    # Regexes compiled once, outside the page loop.
    # BUG FIX: the product-link pattern was split across two physical source
    # lines, which breaks the string literal (SyntaxError); the newline that
    # appears in the listing HTML is now written explicitly as \n.
    name_re = re.compile('target="_blank" title="(.*?)"', re.S)
    link_re = re.compile('<div class="productImg-wrap">\n<a href="(.*?)" class="productImg" target="_blank" data-p="', re.S)
    sales_re = re.compile('<span>月成交 <em>(.*?)笔', re.S)
    price_re = re.compile('<em title="(.*?)"><b>&yen', re.S)
    region_re = re.compile('name="region" value="(.*?)"', re.S)

    # Search URL template: 60 products per page, offset via the 's' parameter.
    url1 = "https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.36105702i4oQH9&s="
    url2 ="&q=" +keyword+"&sort=s&style=g&from=..pc_1_searchbutton&active=2&type=pc#J_Filte"

    # Scrape each requested page: extract links, names, prices and sales.
    for i in range(1,frequency+1):

        # A failed page is reported and skipped; the loop continues.
        try:
            print("----正在爬取第%d页----"%i)
            url = url1 +str((i-1)*60)+url2
            r = requests.get(url,headers = headers)
            names = name_re.findall(r.text)
            if names:
                print('访问成功')
            else:
                print("访问失败,请更改代码的里cookie,或者明天再使用")
            urls = link_re.findall(r.text)
            chengjiaoliangs = sales_re.findall(r.text)
            moneys = price_re.findall(r.text)

            # Visit each product page to extract its region ("产地").
            wheres = []
            for x in range(len(urls)):
                try:
                    wurl = 'http:'+urls[x]
                    w = requests.get(wurl,headers = headers)
                    wheres.append(region_re.findall(w.text)[0])
                    if wheres[x]:
                        print('第%d页%d条商品信息爬取成功'%(i,x+1))
                except Exception as er:
                    # BUG FIX: append a placeholder on failure so that
                    # `wheres` stays index-aligned with `urls`/`names`;
                    # the original skipped, shifting every later row's region.
                    wheres.append('')
                    print(er)
            if names:
                print("----第%d页爬取成功----" % i)
            print("----第%d页开始写入----"%i)

            # Write one worksheet row per product.
            # BUG FIX: range(1, len(urls)) dropped the last product of every
            # page; range(1, len(urls) + 1) covers them all.
            for y in range(1, len(urls) + 1):

                # A failed row is reported and skipped; writing continues.
                try:
                    row = (i - 1) * 60 + y + 1  # +1 leaves row 1 for headers

                    # Auto-widen columns A and E to the longest value seen.
                    if 2 * len(names[y - 1]) > width_a:
                        width_a = 2 * len(names[y - 1])
                        ws.column_dimensions['A'].width = width_a
                    if len(urls[y - 1]) > width_e:
                        width_e = len(urls[y - 1])
                        ws.column_dimensions['E'].width = width_e

                    # Write the five fields for this product.
                    ws['A%d' % row] = names[y - 1]
                    ws['B%d' % row] = moneys[y - 1] + '元'
                    ws['C%d' % row] = wheres[y - 1]
                    ws['D%d' % row] = chengjiaoliangs[y - 1] + '笔'
                    ws['E%d' % row] = 'http:' + urls[y - 1]
                    # BUG FIX: the counter printed y+1 (first item shown as 2);
                    # y itself is the 1-based item number.
                    print('----第%d页第%d条写入成功----' % (i, y))
                except Exception:
                    print('----第%d页第%d条写入失败----' % (i, y))
            print("----第%d页写入成功----" % i)
        except Exception as err:
            print(err)

    # Ensure the output directory exists (race-free, unlike exists()+mkdir()).
    path = './天猫数据爬取excel文件/'
    os.makedirs(path, exist_ok=True)

    # Save the workbook under the user-chosen name.
    wb.save(path +name)

    print('----已经全部写入----')
    print('----感谢使用----')
    os.system('pause')

    

  • 相关阅读:
    6410实现网卡(DM9000A)收发功能及ARP协议实现
    Shuffling Machine和双向链表
    Have Fun with Numbers及循环链表(约瑟夫问题)
    Tiny6410 LCD设置
    RAM与内存
    inet_addr解析
    map容器find用法
    WinSock编程(TCP)
    Python 时间序列作图及注释
    无法打开之前cuda的vs项目,打开之后变灰色
  • 原文地址:https://www.cnblogs.com/c-aha/p/10102494.html
Copyright © 2011-2022 走看看