zoukankan      html  css  js  c++  java
  • 爬吉他谱

    写个小爬虫,爬一爬吉他谱
    # -*- coding: utf-8 -*- #coding=UTF8 import os import sys import logging import urllib import urllib2 import chardet import re import cookielib import urlparse from bs4 import BeautifulSoup sysEncoding = sys.getfilesystemencoding() cookieJar = cookielib.CookieJar() def get(url): req = urllib2.Request(url) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) response = opener.open(req) return response.read() def download_guitar_image(url, target): print 'start download guitar image ...' req = urllib2.Request(url) req.add_header('Accept','image/webp,image/*,*/*;q=0.8') opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) response = opener.open(req) content = response.read() with open(target, 'wb') as code: code.write(content) #解析吉他谱图片页面链接地址 def parse_guitar_img_link(): page_list = [] url_base = 'http://www.17jita.com/' page = 1 while True: url = url_base + 'tab/img/index.php?page=' + str(page) print url html = get(url) soup = BeautifulSoup(html, "html5lib") list = soup.select('#ct dl > dt > a') if not list: break for item in list: page_list.append({ 'title' : item.text, 'link' : url_base + item['href'] }) page += 1 return page_list def download_guitar_image_link_list(url): image_link_list = [] page = 1 while True: page_url = url if page > 1: page_url = url.replace('.html', '' + str(page) + '.html') try: html = get(page_url) soup = BeautifulSoup(html, 'html5lib') img_list = soup.select('#article_contents a > img') for img in img_list: image_link_list.append(img['src']) except urllib2.URLError, e: msg = u'下载 ' + page_url + u' 出错, 原因: ' + e.reason print msg logging.error(msg) break page += 1 return image_link_list if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='guitar.log', filemode='a') path = 'guitar' if not os.path.exists(path): os.mkdir(path) page_list = parse_guitar_img_link() for page in 
page_list: print page['link'] + '(' + page['title'] + ')' guitar_path = path + '/' + (page['title']).encode('GBK') if not os.path.exists(guitar_path): os.mkdir(guitar_path) image_link_list = download_guitar_image_link_list(page['link']) for image_link in image_link_list: print ' ' + image_link filename = image_link[image_link.rindex('/'):] filepath = guitar_path + filename.encode('GBK') download_guitar_image(image_link, filepath)

      

    小蟒蛇
  • 相关阅读:
    table中tr间距的设定table合并单元格 colspan(跨列)和rowspan(跨行)
    使用jquery触发a标签跳转
    真正的让iframe自适应高度 兼容多种浏览器随着窗口大小改变
    html5 data属性的使用
    jQuery取得select选择的文本与值
    jqueryui教程
    密码复杂度
    zabbix配置微信报警
    tomcat配置域名访问
    阿里云ecs禁止ping,禁止telnet
  • 原文地址:https://www.cnblogs.com/pyxiaomangshe/p/7723533.html
Copyright © 2011-2022 走看看