  • A dead website: shige.laiyo.com

    In April 2017, the second semester of my first year of graduate school had just begun.
    I crawled this website back then. The site has since shut down, so this crawler code is useless now; a crawler written against one specific site is inherently disposable.

    Crawling authors

    import requests
    from pyquery import PyQuery as jq
    import re
    import json
    
    ar = []
    for i in range(1, 3132):
        url = "http://shige.laiyo.com/zuozhe_%dA1.aspx" % i
        print("requesting", url)
        resp = requests.get(url)
        print(resp.status_code, "response over")
        # set the encoding before touching resp.text so the Chinese decodes correctly
        resp.encoding = "utf8"
        if "网页发生错误" in resp.text:  # the site's "page error" message
            print("no author with id", i)
            continue
        html = jq(resp.text)
        img = re.search(r"http://img\.gushiwen\.org/authorImg/.*?\.jpg", resp.text)
        if img:
            img = img.group()
        name = html(".left .title h1 span").text().strip()
        name = name[:name.index("的")]  # the title reads like "李白的诗词"; keep only the author name
        desc = html(".left .sons").eq(0).find(".cont").text().strip()
        author = {"id": i, "img": img, "description": desc, 'name': name}
        ar.append(author)
        # print(json.dumps(author, indent=1, ensure_ascii=0))
        # input()
    json.dump(ar, open("author2.json", "w", encoding="utf8"), ensure_ascii=0, indent=1)
    
    

    Crawling cipai (tune patterns for ci poems)

    import requests
    import re
    from pyquery import PyQuery as pq
    import json
    
    url = "http://www.meili999.com/cipai/index.html"
    url2 = "http://www.meili999.com/cipai/index_2.html"
    urls = []
    for url in (url, url2):
        resp = requests.get(url)
        resp.encoding = 'utf8'
        html = pq(resp.text).find("#content .list")
        urls += re.findall(r"http://www\.meili999\.com/cipai/\d+/", html.html())
    data = []
    for url in urls:
        resp = requests.get(url)
        resp.encoding = 'utf8'
        html = pq(resp.text)
        ci = dict()
        ci['title'] = html(".pageTitle").text()
        desc = html(".poem_comm p")
        txt = ""
        for p in desc:
            txt += pq(p).text().strip() + "\n"
        ci['description'] = txt
        print(ci['title'])
        data.append(ci)
    with open("cipai.json", "w", encoding="utf8") as f:
        json.dump(data, f, ensure_ascii=False, indent=1)
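
    All of these scripts fire bare requests.get calls in a tight loop, so a single transient 5xx kills the run. A minimal sketch of a shared session with retries that any of them could use instead (the fetch helper is my own naming, not part of the original code):

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # one session reused across requests, with automatic retries on server errors
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))

    def fetch(url):
        # hypothetical drop-in for requests.get; sets the encoding up front
        resp = session.get(url, timeout=10)
        resp.encoding = "utf8"
        return resp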
    
    

    Crawling poems

    from pprint import pprint
    
    import requests
    from pyquery import PyQuery as jq
    import re
    import json
    
    
    def filt(s):
        # reduce an HTML fragment to plain text: <br> tags become newlines,
        # entities (&...;) are dropped, remaining tags are stripped
        if not s:
            return None
        s = re.sub("<br.*?>", "\n", s)
        s = re.sub("&.*?;", "", s)
        s = re.sub("<.*?>", "", s)
        return s.strip()
    
    
    def part(son):
        partName = son.find(".cont p").eq(0).text()
        return {partName: filt(jq(son).find(".cont").html())}
    
    
    def mainPart(son):
        cont = son(".cont")
        source = son(".source")
        title = cont("p").eq(0).text().strip()
        preface = None
        dynasty = source("a").eq(1).text()
        author = source('a').eq(0).text()
        poemContent = ""
        for i in cont("#contson p"):
            span = jq(i).find('span')
            if span:
                preface = filt(span.html())
            else:
            poemContent += filt(jq(i).html()) + "\n"
        poemContent = poemContent.strip()
        if not poemContent:
            poemContent = filt(son("#contson").html())
        return {'title': title,
                'content': poemContent,
                'preface': preface,
                'dynasty': dynasty,
                'author': author
                }
    
    
    def parseHtml(html):
        sons = html(".sons")
        d = mainPart(sons.eq(0))
        related = []
        for i in range(1, sons.size()):
            if sons.eq(i).attr('id'):
                related.append(int(sons.eq(i).attr('id')[4:]))
                # debug output; the input() pause was for stepping through related poems by hand
                # print(related[-1], '=====')
                # print(sons.eq(i))
                # input()
            else:
                d = {**d, **part(sons.eq(i))}
        d = {**d, "related": related}
        return d
    
    
    ar = []
    for i in range(1, 73225):
        url = "http://shige.laiyo.com/view_%s.aspx" % i
        print("requesting", url)
        resp = requests.get(url)
        print(resp.status_code, "response")
        resp.encoding = "utf8"
        if "网页发生错误" in resp.text:
            print("没有", i, "这首诗")
            continue
        open("haha.html", 'w',encoding='utf8').write(resp.text)
        html = jq(resp.text)
        poem = {'id': i, **parseHtml(html)}
        ar.append(poem)
        pprint(poem)
        # input()  # debug pause; uncomment to step through one poem at a time
    json.dump(ar, open("poem.json", "w", encoding='utf8'), ensure_ascii=0, indent=1)
    
    

    Crawling categories

    import requests
    from pyquery import PyQuery as jq
    import re
    import json
    
    ar = []
    for i in range(1, 774):
        href = "http://shige.laiyo.com/leixing_%dA%d.aspx"
        url = href % (i, 1)
        print("requesting", url)
        resp = requests.get(url)
        print("response over", resp.status_code)
        resp.encoding = "utf8"
        html = jq(resp.text)
        category = html('.left .title h1 span').text()
        poems = []
        for j in range(1, 0xffffff):  # page through until no more poem anchors appear
            url = href % (i, j)
            print("requesting", url)
            resp = requests.get(url)
            print("response over", resp.status_code)
            shige = re.findall(r"#shige\d+", resp.text)
            if not shige:
                break
            for k in shige:
                poems.append(int(re.search(r"\d+", k).group()))
        d = {"name": category, "poems": poems}
        ar.append(d)
    json.dump(ar, open("type.json", "w", encoding="utf8"), indent=1, ensure_ascii=0)
    
    
  • Original post: https://www.cnblogs.com/weiyinfu/p/9300663.html