zoukankan      html  css  js  c++  java
  • 爬吉他谱

    写个小爬虫,爬一爬吉他谱
    # -*- coding: utf-8 -*-
    """Small crawler that downloads guitar tab images from www.17jita.com.

    The script walks the paginated tab index, collects every tab article link,
    extracts the image URLs of each article and stores the images under
    ``guitar/<article title>/``.

    This is Python 2 code (it relies on ``urllib2``, ``cookielib`` and
    ``urlparse``).  Failures are appended to ``guitar.log``.
    """
    import cookielib
    import logging
    import os
    import re
    import sys
    import urllib
    import urllib2
    import urlparse

    import chardet
    from bs4 import BeautifulSoup

    # Encoding used when turning titles/file names into paths.  Some Python 2
    # builds report None for the filesystem encoding, hence the fallback.
    sysEncoding = sys.getfilesystemencoding() or 'utf-8'
    cookieJar = cookielib.CookieJar()

    # Seconds to wait on a stalled connection before giving up.
    REQUEST_TIMEOUT = 30

    # Text inserted between the article name and the page number when building
    # the URL of page 2, 3, ... of an article.
    # NOTE(review): the original code concatenated an empty string here, which
    # looks like a separator that was lost (possibly '_'); confirm against the
    # site's real pagination scheme before changing it.
    PAGE_SUFFIX_SEPARATOR = ''

    # Characters that cannot appear in a file or directory name on common
    # filesystems (path separators, Windows reserved characters, controls).
    _UNSAFE_NAME_CHARS = re.compile(u'[\\\\/:*?"<>|\\x00-\\x1f]')

    # One opener shared by every request so all of them use the same cookie jar.
    _opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))


    def _to_unicode(value):
        """Return *value* as unicode text, never raising on odd encodings."""
        if isinstance(value, unicode):
            return value
        try:
            raw = str(value)
        except UnicodeError:
            raw = repr(value)
        return raw.decode(sysEncoding, 'replace')


    def _echo(text):
        """Print *text*, encoding unicode so a limited console cannot crash us."""
        if isinstance(text, unicode):
            encoding = getattr(sys.stdout, 'encoding', None) or sysEncoding
            text = text.encode(encoding, 'replace')
        print(text)


    def _report_failure(url, error):
        """Print and log a download failure for *url* caused by *error*."""
        # URLError carries the cause in ``reason``; it may be an exception
        # object rather than a string, so it is converted explicitly.
        reason = getattr(error, 'reason', error)
        msg = u'下载 ' + _to_unicode(url) + u' 出错, 原因: ' + _to_unicode(reason)
        _echo(msg)
        logging.error(msg)


    def _fetch(url, accept=None):
        """Return the body of *url*, optionally sending an ``Accept`` header."""
        req = urllib2.Request(url)
        if accept is not None:
            req.add_header('Accept', accept)
        response = _opener.open(req, timeout=REQUEST_TIMEOUT)
        try:
            return response.read()
        finally:
            # Always release the connection, even when read() fails.
            response.close()


    def _document_base(soup, page_url):
        """Return the URL that relative links in the parsed page resolve against.

        An HTML ``<base href>`` takes precedence; otherwise it is *page_url*.
        """
        base_tag = soup.find('base', href=True)
        if base_tag is not None:
            return urlparse.urljoin(page_url, base_tag['href'])
        return page_url


    def _safe_name(text, fallback):
        """Turn *text* into a byte string usable as a single path component.

        Unsafe characters become ``_``; *fallback* is used when nothing is left.
        """
        cleaned = _UNSAFE_NAME_CHARS.sub(u'_', _to_unicode(text)).strip(u' .')
        if not cleaned:
            cleaned = fallback
        # 'replace' yields '?' for unencodable characters, which is itself not a
        # valid file name character on Windows, so map it to '_' as well.
        return cleaned.encode(sysEncoding, 'replace').replace('?', '_')


    def _image_filename(image_link, index):
        """Derive a local file name from the last path segment of *image_link*."""
        url_path = urlparse.urlsplit(image_link).path
        name = url_path.rsplit('/', 1)[-1]
        return _safe_name(name, u'image_%d' % index)


    def get(url):
        """Fetch *url* with the shared cookie jar and return the raw body."""
        return _fetch(url)


    def download_guitar_image(url, target):
        """Download the image at *url* and write it to the file *target*."""
        print('start download guitar image ...')
        content = _fetch(url, accept='image/webp,image/*,*/*;q=0.8')
        with open(target, 'wb') as out:
            out.write(content)


    def parse_guitar_img_link():
        """Collect the links of all guitar tab image pages from the index.

        Index pages are requested one after another (``page=1, 2, ...``) until a
        page contributes no link that has not been seen before.

        Returns:
            A list of dicts with the keys ``'title'`` (link text) and ``'link'``
            (absolute URL of the tab page).

        Network errors raised while fetching an index page are not caught here.
        """
        page_list = []
        seen_links = set()
        url_base = 'http://www.17jita.com/'
        page = 1
        while True:
            url = url_base + 'tab/img/index.php?page=' + str(page)
            print(url)
            html = get(url)
            soup = BeautifulSoup(html, "html5lib")
            found_new = False
            for item in soup.select('#ct dl > dt > a'):
                href = item.get('href')
                if not href:
                    continue
                # urljoin equals plain concatenation for site-relative hrefs and
                # additionally copes with absolute or '/'-rooted ones.
                link = urlparse.urljoin(url_base, href)
                if link in seen_links:
                    continue
                seen_links.add(link)
                page_list.append({
                    'title': item.text,
                    'link': link
                })
                found_new = True
            # Stop on an empty page, and also when the site keeps serving links
            # we already have (otherwise the loop would never end).
            if not found_new:
                break
            page += 1
        return page_list


    def download_guitar_image_link_list(url):
        """Return the image URLs found on every page of the tab article *url*.

        Follow-up pages are derived from *url* by inserting the page number in
        front of ``.html``.  Collection stops at the first page that cannot be
        fetched (the failure is printed and logged) or that yields no new image.

        Args:
            url: URL of the first page of the article.

        Returns:
            A list of absolute image URLs in page order, without duplicates.
        """
        image_link_list = []
        seen_links = set()
        # Without '.html' the replace below is a no-op and every "next page"
        # would be the first page again, so only that page is fetched.
        paginated = '.html' in url
        page = 1
        while True:
            page_url = url
            if page > 1:
                page_url = url.replace(
                    '.html', PAGE_SUFFIX_SEPARATOR + str(page) + '.html')
            try:
                html = get(page_url)
            except IOError as e:
                # IOError covers urllib2.URLError/HTTPError and socket errors
                # (including timeouts) in Python 2.
                _report_failure(page_url, e)
                break
            soup = BeautifulSoup(html, 'html5lib')
            base_url = _document_base(soup, page_url)
            new_links = []
            for img in soup.select('#article_contents a > img'):
                src = img.get('src')
                if not src:
                    continue
                link = urlparse.urljoin(base_url, src)
                if link not in seen_links:
                    seen_links.add(link)
                    new_links.append(link)
            if not new_links:
                break
            image_link_list.extend(new_links)
            if not paginated:
                break
            page += 1
        return image_link_list


    def main():
        """Crawl the tab index and download every image into ``guitar/``."""
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
            filename='guitar.log',
            filemode='a')
        path = 'guitar'
        if not os.path.exists(path):
            os.mkdir(path)
        page_list = parse_guitar_img_link()
        for page in page_list:
            _echo(page['link'] + '(' + page['title'] + ')')
            guitar_path = os.path.join(path, _safe_name(page['title'], u'untitled'))
            try:
                if not os.path.exists(guitar_path):
                    os.mkdir(guitar_path)
            except OSError as e:
                _report_failure(page['link'], e)
                continue
            image_link_list = download_guitar_image_link_list(page['link'])
            for index, image_link in enumerate(image_link_list, 1):
                _echo(' ' + image_link)
                filepath = os.path.join(
                    guitar_path, _image_filename(image_link, index))
                try:
                    download_guitar_image(image_link, filepath)
                except EnvironmentError as e:
                    # One broken image must not abort the whole crawl.
                    _report_failure(image_link, e)


    if __name__ == '__main__':
        main()

      

    小蟒蛇
  • 相关阅读:
    欧拉回路 定理
    UESTC 1087 【二分查找】
    POJ 3159 【朴素的差分约束】
    ZOJ 1232 【灵活运用FLOYD】 【图DP】
    POJ 3013 【需要一点点思维...】【乘法分配率】
    POJ 2502 【思维是朴素的最短路 卡输入和建图】
    POJ 2240 【这题貌似可以直接FLOYD 屌丝用SPFA通过枚举找正权值环 顺便学了下map】
    POJ 1860【求解是否存在权值为正的环 屌丝做的第一道权值需要计算的题 想喊一声SPFA万岁】
    POJ 1797 【一种叫做最大生成树的很有趣的贪心】【也可以用dij的变形思想~】
    js 实现slider封装
  • 原文地址:https://www.cnblogs.com/pyxiaomangshe/p/7723533.html
Copyright © 2011-2022 走看看