  • Python: crawl every image on the mzitu site (thread/process crawling can be added yourself), with image deduplication

    from bs4 import BeautifulSoup
    import os
    import requests
    from lxml import etree


    def get_fenlei(url, headers):
        """
        Fetch the category links from the mzitu front page and start
        crawling each category.
        :param url: site root URL
        :param headers: request headers
        :return: None
        """
        response = requests.get(url, headers=headers).text
        soup = BeautifulSoup(response, 'lxml')
        fenlei_url = soup.select('#menu-nav > li > a')
        for i in fenlei_url:
            fen_lei_lianjie = i['href']
            path_seg = fen_lei_lianjie.split('/')[3]
            if path_seg == 'all':
                continue
            elif path_seg != '':
                fenlei_name = i.get_text()
                print(fenlei_name, fen_lei_lianjie, 'category front page')
                get_fenlei_xia(fen_lei_lianjie, fenlei_name)


    def get_fenlei_xia(fen_lei_lianjie, fenlei_name):
        """
        Collect every pagination URL under one category: the numbered
        pages and, where those are missing, the comment pages.
        :param fen_lei_lianjie: category URL
        :param fenlei_name: category name
        :return: None
        """
        print('crawling category:', fen_lei_lianjie)
        response = requests.get(fen_lei_lianjie, headers=headers).text
        html = etree.HTML(response)
        # The last numbered pagination link carries the total page count.
        fenye_page = html.xpath('/html/body/div[2]/div[1]/div[2]/nav/div/a[4]')
        page_list = []
        if fenye_page:
            for i in fenye_page:
                page_shu = i.xpath('./text()')[0]
                page_url = i.xpath('./@href')[0]
                for ia in range(1, int(page_shu) + 1):
                    fenlei_url = 'https://www.mzitu.com/{}/page/{}/'.format(page_url.split('/')[3], ia)
                    page_list.append(fenlei_url)
        else:
            print('no numbered pagination found')
        # Some categories paginate through their comment pages instead.
        fenye_page2 = html.xpath('//*[@id="comments"]/div/a[3]')
        if fenye_page2:
            for aa in fenye_page2:
                shuliang_shu = aa.xpath('./text()')[0]
                shuliang_url = aa.xpath('./@href')[0]
                for page in range(1, int(shuliang_shu) + 3):
                    shen_url = 'https://www.mzitu.com/{}/comment-page-{}/#comments'.format(
                        shuliang_url.split('/')[3], page)
                    page_list.append(shen_url)
        else:
            print('no comment pagination found')
        for shu in page_list:
            get_all_url(shu, fenlei_name)


    def get_all_url(shu, fenlei_name):
        """
        Crawl the gallery links on one pagination page.
        :param shu: pagination page URL
        :param fenlei_name: category name
        :return: None
        """
        print('crawling page:', shu)
        response = requests.get(shu, headers=headers).text
        soup = BeautifulSoup(response, 'lxml')
        gallery_links = soup.select('#pins > li > a')
        for i in gallery_links:
            img_src = i['href']
            get_img_gref(img_src, fenlei_name)


    def get_img_gref(img_src, fenlei_name):
        """
        Download the image shown on one gallery page; files whose names
        are already on disk are skipped (image deduplication).
        :param img_src: gallery page URL
        :param fenlei_name: category name
        :return: None
        """
        print('crawling gallery:', img_src)
        try:
            response = requests.get(img_src, headers=headers)
            print(response.status_code, 'page status code')
            html = etree.HTML(response.text)
            img_href = html.xpath('//div[2]/div[1]/div[3]/p/a/img/@src')[0]
            lei_name = html.xpath('/html/body/div[2]/div[1]/div[2]/span[1]/a/text()')[0]
            if fenlei_name != lei_name:
                return None
            root_dir = os.path.join(r'D:\web_xiangmu\biquge_tushu\妹子图', fenlei_name)
            if not os.path.exists(root_dir):
                os.makedirs(root_dir)
                print('category folder <<<{}>>> created'.format(fenlei_name))
            else:
                print('category folder <<<{}>>> already exists'.format(fenlei_name))
            # Image deduplication: skip the download when a file with the
            # same name is already in the category folder.
            img_name = img_href.split('/')[-1]
            img_path = os.path.join(root_dir, img_name)
            if os.path.exists(img_path):
                print(img_name, 'already exists')
            else:
                res = requests.get(img_href, headers=headers)
                with open(img_path, 'wb') as f:
                    f.write(res.content)
                print(fenlei_name + '---' + img_name + ' saved')
        except Exception:
            print('<<< connection failed >>>')


    def main():
        return get_fenlei(url, headers)


    if __name__ == '__main__':
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'https://www.mzitu.com/hot/',
        }
        url = "https://www.mzitu.com/"
        main()
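
    The title says thread- or process-based crawling can be added yourself. A minimal sketch of one way to do that with the standard library's concurrent.futures, assuming the functions above are in scope; crawl_pages_threaded and its workers parameter are hypothetical names, not part of the original script:

    from concurrent.futures import ThreadPoolExecutor

    def crawl_pages_threaded(page_list, fenlei_name, workers=8):
        # One task per pagination page; the work is network-bound, so
        # threads overlap the waits on requests.get.
        with ThreadPoolExecutor(max_workers=workers) as pool:
            for shu in page_list:
                pool.submit(get_all_url, shu, fenlei_name)

    Swapping ThreadPoolExecutor for ProcessPoolExecutor would use processes instead, at the cost of pickling the arguments; the module-level headers would then need to be defined at import time rather than under the __main__ guard.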
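    The deduplication above goes by file name only, so the same image served under two different names would be saved twice. A hedged sketch of content-based deduplication with MD5 hashes; file_md5, build_seen_hashes, and is_duplicate are hypothetical helpers, not part of the original script:

    import hashlib
    import os

    def file_md5(path, chunk=8192):
        # Hash an existing file in chunks so large images never sit fully in memory.
        h = hashlib.md5()
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(chunk), b''):
                h.update(block)
        return h.hexdigest()

    def build_seen_hashes(root_dir):
        # Hash every file already saved in the category folder, once per run.
        return {file_md5(os.path.join(root_dir, name)) for name in os.listdir(root_dir)}

    def is_duplicate(content, seen_hashes):
        # Compare the downloaded bytes against everything seen so far;
        # record new digests so later downloads are checked against them too.
        digest = hashlib.md5(content).hexdigest()
        if digest in seen_hashes:
            return True
        seen_hashes.add(digest)
        return False

    In get_img_gref this would mean fetching res first and calling is_duplicate(res.content, seen_hashes) before the open(...) block, trading one extra request per duplicate for name-independent deduplication.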
  • Original post: https://www.cnblogs.com/duanlinxiao/p/11995421.html