zoukankan      html  css  js  c++  java
  • 爬虫BS4—淘女郎

    1.修改网页头

    用单独的 py 文件 getheaders,随机返回一个 header

    getheaders文件

    import random

    headerstr = """Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
    Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
    Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999 """


    def headers():
    header = headerstr.split(" ")
    length = len(header)
    return header[random.randint(0, length - 1)]




    2.主文件
    
    
    # coding:utf-8
    from bs4 import BeautifulSoup
    import urllib2
    from getheaders import headers
    from json import loads
    import re
    import os


    def reqobject(): # 实例化一个请求对象,还没有访问
    # 1、实例化一个请求对象,还没有访问
    req = urllib2.Request("https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8")
    # 2、对请求对象进行加工,添加用户头
    req.add_header('user-agent', headers())
    return req


    def getUrlList(): # 获取页面所有的用户信息
    req = reqobject()
    # 1.2.再次对对象进行加工,添加参数
    req.add_data(
    'q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A&searchFansNum=&currentPage=1&pageSize=100')
    # 3、访问对象并解码+编码
    # """
    # decode('gbk') 解码:吧gbk格式解码为Unicode
    # decode解码时要对应网页的respon heasers里面的content-type:text/html;charset=GBK
    # 若没有则查看网页源代码头部<meta charset="gbk" />
    # encode('utf-8') 编码:把Unicode编码为utf-8
    # encode只能编码Unicode
    # """
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    # 4、取值,html为一个json对象,先转化为dict,便于取值
    json_dict = loads(html)
    # 5、返回一个列表
    return json_dict['data']['searchDOList']


    def getInfo(userid): # 获取用户的“她的爱秀”
    req = urllib2.Request("https://mm.taobao.com/self/aiShow.htm?&userId=%s" % userid)
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    return html


    def getNeedInf(html): # 提取我们需要的信息
    soup = BeautifulSoup(html, 'html.parser')
    name = soup.select('dl > dd > a')[0].text.encode('utf-8')
    follow = soup.select('dl > dt > a')[1].text.encode('utf-8')
    fens = soup.select('dl > dt > a')[2].text.encode('utf-8')
    detail = soup.find('div', attrs={'id': 'J_ScaleImg'}).get_text().strip().encode('utf-8')
    content = "姓名:{} 关注:{} 粉丝:{} {}".format(name, follow, fens, detail)
    if os.path.exists("images\" + str(userid)) == False:
    os.mkdir("images\" + str(userid))
    print 'Start downloading...'
    print 'getInf:{}'.format(str(userid))
    with open("images\{}\{}.txt".format(str(userid), str(userid)), 'wb') as f:
    f.write(content)


    def getAlbumList(userid): # 获取用户的“相册”和相册的封面照片链接
    req = urllib2.Request("https://mm.taobao.com/self/album/open_album_list.htm?&user_id%20=" + str(userid)) # 相册链接
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    # 获取每一个相册的链接
    rel = r'class="mm-first" href="//(.*?)"'
    AlbumListurl = re.findall(rel, html)
    # 获取每一个相册的封面的链接,用于下载封面图片
    # rel = r'<img src="//(.*?jpg_240x240xz.jpg)" width="125" height="125">'
    # 爬取出来的链接:img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_240x240xz.jpg
    # 我们需要的链接:img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_620x10000.jpg
    # 相差为【jpg_240x240xz.jpg】和【jpg_620x10000.jpg】所以将【jpg_240x240xz.jpg】写在括号外面
    # 爬到链接img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.
    # 再补齐【jpg_620x10000.jpg】,如下
    rel = r'<img src="//(.*?)jpg_240x240xz.jpg" width="125" height="125">'
    AlbumListCoverurl = re.findall(rel, html)
    getAlbumListCoverurl = []
    for url in AlbumListCoverurl:
    url += "jpg_620x10000.jpg"
    url = "http://" + url
    getAlbumListCoverurl.append(url)
    return getAlbumListCoverurl


    def getimages(userid, urls): # 通过图片链接下载图片
    # http://img.alicdn.com/imgextra/i3/865838484/TB1_n_XKVXXXXb5XXXXXXXXXXXX_!!865838484-0-tstar.jpg_620x10000
    # if os.path.exists("images\" + str(userid)) == False:
    # os.mkdir("images\" + str(userid))
    i = 1
    for url in urls:
    req = urllib2.Request(url)
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read()
    # with open('images\'+str(userid)+"\" + str(i) + '.jpg', 'wb') as f:
    with open('images\{}\{}.jpg'.format(str(userid), str(i)), 'wb') as f:
    f.write(html)
    print "getImage:", url
    i += 1
    print "End of download..."


    for user in getUrlList():
    if os.path.exists("images") == False:
    os.mkdir("images")
    try:
    userid = user['userId']
    html = getInfo(userid)
    getNeedInf(html)
    # for i in getAlbumList(userid):
    # print i
    urls = getAlbumList(userid)
    getimages(userid, urls)
    except urllib2.URLError,e:
    print e.reason
     
     
  • 相关阅读:
    codeforces 616B Dinner with Emma
    codeforces 616A Comparing Two Long Integers
    codeforces 615C Running Track
    codeforces 612C Replace To Make Regular Bracket Sequence
    codeforces 612B HDD is Outdated Technology
    重写父类中的成员属性
    子类继承父类
    访问修饰符
    方法的参数
    实例化类
  • 原文地址:https://www.cnblogs.com/wskxy/p/7399881.html
Copyright © 2011-2022 走看看