zoukankan      html  css  js  c++  java
  • 爬虫BS4—淘女郎

    1.修改请求头(User-Agent)

    用一个单独的 py 文件 getheaders,每次调用随机返回一个 User-Agent 请求头

    getheaders文件

    import random

    # Pool of User-Agent strings, one per line.
    headerstr = """Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
    Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
    Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999 """


    def headers():
        """Return one complete User-Agent string chosen at random from ``headerstr``.

        ``headerstr`` holds one User-Agent per line. Each line is stripped of
        surrounding whitespace and blank lines are ignored, so the result is
        always a whole User-Agent value, never a fragment of one.
        """
        # Split on line boundaries: the entries themselves contain spaces, so
        # splitting on " " would hand back pieces such as "(Windows".
        agents = [line.strip() for line in headerstr.splitlines() if line.strip()]
        return random.choice(agents)




    2.主文件
    
    
    # coding:utf-8
    from bs4 import BeautifulSoup
    import urllib2
    from getheaders import headers
    from json import loads
    import re
    import os


    def reqobject():
        """Build the request object for the model-search endpoint without sending it.

        The returned ``urllib2.Request`` targets the ``tstar_model.do`` search URL
        and carries a ``user-agent`` header whose value comes from ``headers()``.
        """
        search_url = "https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8"
        # Supply the header at construction time; nothing is fetched here.
        return urllib2.Request(search_url, headers={'user-agent': headers()})


    def getUrlList(page=1, page_size=100):
        """Fetch one page of the model search results and return its user entries.

        Args:
            page: value sent as ``currentPage`` (defaults to 1).
            page_size: value sent as ``pageSize`` (defaults to 100).

        Returns:
            The list stored under ``['data']['searchDOList']`` in the JSON reply.
        """
        from contextlib import closing

        req = reqobject()
        # Attach the search parameters as the request data.
        req.add_data(
            'q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A'
            '&searchFansNum=&currentPage={}&pageSize={}'.format(page, page_size))
        # Close the connection as soon as the body has been read.
        with closing(urllib2.urlopen(req)) as response:
            raw = response.read()
        # The body is decoded as GBK and re-encoded as UTF-8 before parsing.
        # When reusing this for another page, take the charset from the
        # response's Content-Type header or the page's <meta charset>.
        html = raw.decode('gbk').encode('utf-8')
        # The body is a JSON document; turn it into a dict to pick out the list.
        json_dict = loads(html)
        return json_dict['data']['searchDOList']


    def getInfo(userid):
        """Download a user's ``aiShow.htm`` page and return it as UTF-8 bytes.

        Args:
            userid: value substituted into the ``userId`` query parameter.

        Returns:
            The page body, decoded as GBK and re-encoded as UTF-8.
        """
        from contextlib import closing

        req = urllib2.Request("https://mm.taobao.com/self/aiShow.htm?&userId=%s" % userid)
        req.add_header('user-agent', headers())
        # Close the connection explicitly instead of leaving it to the
        # garbage collector.
        with closing(urllib2.urlopen(req)) as response:
            raw = response.read()
        return raw.decode('gbk').encode('utf-8')


    def getNeedInf(html, userid=None):
        """Extract the profile fields from ``html`` and save them to a text file.

        The name, follow count, fan count and the text of the ``J_ScaleImg``
        div are written to ``images/<userid>/<userid>.txt``; the per-user
        directory is created when it does not exist yet.

        Args:
            html: markup of the user's page, as returned by ``getInfo``.
            userid: id used for the directory and file name. When omitted, the
                module-level ``userid`` variable is used, which is what callers
                passing only ``html`` rely on.

        Raises:
            IndexError / AttributeError: if the expected elements are missing
                from ``html``.
        """
        if userid is None:
            # Backward compatibility: fall back to the module-level variable
            # that callers set before invoking this function.
            userid = globals()['userid']

        soup = BeautifulSoup(html, 'html.parser')
        name = soup.select('dl > dd > a')[0].text.encode('utf-8')
        stat_links = soup.select('dl > dt > a')
        follow = stat_links[1].text.encode('utf-8')
        fens = stat_links[2].text.encode('utf-8')
        detail = soup.find('div', attrs={'id': 'J_ScaleImg'}).get_text().strip().encode('utf-8')
        content = "姓名:{} 关注:{} 粉丝:{} {}".format(name, follow, fens, detail)

        # Build paths with os.path.join rather than hand-written backslashes,
        # so the separator is always the one the current platform expects.
        user_dir = os.path.join("images", str(userid))
        if not os.path.exists(user_dir):
            os.makedirs(user_dir)
        print('Start downloading...')
        print('getInf:{}'.format(str(userid)))
        with open(os.path.join(user_dir, "{}.txt".format(str(userid))), 'wb') as f:
            f.write(content)


    def getAlbumList(userid):
        """Return the large-size cover image URLs of a user's albums.

        The album list page is downloaded and every 240x240 cover thumbnail
        reference is rewritten into the matching ``jpg_620x10000.jpg`` URL.

        Args:
            userid: id appended to the album-list URL.

        Returns:
            A list of ``http://...jpg_620x10000.jpg`` URLs, in page order.
        """
        from contextlib import closing

        # Album list page.
        req = urllib2.Request("https://mm.taobao.com/self/album/open_album_list.htm?&user_id%20=" + str(userid))
        req.add_header('user-agent', headers())
        with closing(urllib2.urlopen(req)) as response:
            html = response.read().decode('gbk').encode('utf-8')

        # A thumbnail reference in the page looks like
        #   img.alicdn.com/imgextra/.../xxx-tstar.jpg_240x240xz.jpg
        # while the image we want is
        #   img.alicdn.com/imgextra/.../xxx-tstar.jpg_620x10000.jpg
        # The two differ only in the trailing size suffix, so the suffix is kept
        # outside the capture group and the large-size suffix is appended.
        cover_pattern = r'<img src="//(.*?)jpg_240x240xz\.jpg" width="125" height="125">'
        return ["http://" + stem + "jpg_620x10000.jpg"
                for stem in re.findall(cover_pattern, html)]


    def getimages(userid, urls):
        """Download every URL in ``urls`` into ``images/<userid>/<n>.jpg``.

        Files are numbered from 1 in the order the URLs are given. The per-user
        directory is created if it does not exist yet.

        Args:
            userid: id used as the name of the target directory.
            urls: iterable of image URLs, e.g.
                http://img.alicdn.com/imgextra/i3/865838484/TB1_n_XKVXXXXb5XXXXXXXXXXXX_!!865838484-0-tstar.jpg_620x10000
        """
        from contextlib import closing

        user_dir = os.path.join("images", str(userid))
        if not os.path.isdir(user_dir):
            os.makedirs(user_dir)

        for index, url in enumerate(urls, 1):
            req = urllib2.Request(url)
            req.add_header('user-agent', headers())
            # Release the connection before moving on to the next image.
            with closing(urllib2.urlopen(req)) as response:
                image_bytes = response.read()
            with open(os.path.join(user_dir, "{}.jpg".format(index)), 'wb') as f:
                f.write(image_bytes)
            print("getImage: {}".format(url))
        print("End of download...")


    # Driver: for every user in the search results, save the profile text and
    # the album cover images under images/<userid>/.
    users = getUrlList()
    if not os.path.exists("images"):
        os.mkdir("images")
    for user in users:
        try:
            # Kept as a module-level name: getNeedInf() is called with the page
            # only and reads this variable to decide where to write.
            userid = user['userId']
            html = getInfo(userid)
            getNeedInf(html)
            urls = getAlbumList(userid)
            getimages(userid, urls)
        except urllib2.URLError as e:
            # A network failure for one user is reported and the loop moves on.
            print(e.reason)
     
     
  • 相关阅读:
    冒泡排序
    Objective-C 命名规范
    时间轴的制作
    CocoaPods 哪些事
    消息转发机制入门篇
    架构
    算法学习
    AutoLayout自动布局
    网络学习
    HDU 3832 Earth Hour (最短路)
  • 原文地址:https://www.cnblogs.com/wskxy/p/7399881.html
Copyright © 2011-2022 走看看