zoukankan      html  css  js  c++  java
  • 搜韵网诗词采集

    采集搜韵网诗词时,网站会封禁 IP,建议使用代理进行采集。

    import pymongo
    import requests
    from pyquery import PyQuery as pq
    from urllib.parse import urljoin
    
    
    class Poetry:
        """Crawler that copies poems from sou-yun.cn into a local MongoDB collection.

        The crawl walks three levels of pages: the dynasty index, the author
        list of each dynasty, and the paginated poem list of each author.
        Every poem becomes one document in the ``Poetry`` collection of the
        ``Poetry`` database.

        Note:
            Constructing an instance runs the complete crawl, because
            ``__init__`` ends by calling :meth:`run`.
        """

        def __init__(self, proxies=None, timeout=30):
            """Prepare URLs, headers and the MongoDB collection, then start crawling.

            Args:
                proxies: Optional ``requests``-style proxy mapping, for example
                    ``{'https': 'http://host:port'}``. ``None`` (the default)
                    sends every request directly.
                timeout: Seconds to wait on each HTTP request before
                    ``requests`` raises a timeout error.
            """
            self.start_url = 'https://sou-yun.cn/PoemIndex.aspx'
            self.comment_url = 'https://api.sou-yun.cn/api/Poem?jsonType=true&includeLinks=true&key={}'
            self.headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
            }
            self.proxies = proxies
            self.timeout = timeout
            # Despite the attribute name, this is the target *collection*
            # (database ``Poetry``, collection ``Poetry``), not the client.
            self.client = pymongo.MongoClient(host='localhost', port=27017).Poetry['Poetry']
            # Maps dynasty name -> absolute URL of that dynasty's author list.
            self.dynasty_dict = dict()
            self.run()

        def get_response(self, url):
            """Fetch ``url`` with the crawler's headers, proxies and timeout.

            All HTTP traffic goes through this single method so that proxy
            handling lives in one place.

            Args:
                url: Absolute URL to request.

            Returns:
                The ``requests`` response object.
            """
            # getattr keeps this working for instances that were built
            # without running __init__.
            return requests.get(
                url=url,
                headers=self.headers,
                proxies=getattr(self, 'proxies', None),
                # Without a timeout a stalled connection would block forever.
                timeout=getattr(self, 'timeout', 30),
            )

        def get_dynasty_content(self):
            """Collect the link of every dynasty listed on the index page.

            Fills ``self.dynasty_dict`` with ``{dynasty name: absolute URL}``.
            """
            dynasty_response = self.get_response(self.start_url)
            doc = pq(dynasty_response.content.decode('utf8'))
            for dynasty_doc in doc('.inline1').items():
                link = dynasty_doc('a')
                href = link.attr('href')
                if not href:
                    # urljoin(base, None) would return the index page itself.
                    continue
                self.dynasty_dict[link.text()] = urljoin(self.start_url, href)

        @staticmethod
        def _format_comments(payload):
            """Render the commentary of a poem API payload as plain text.

            Args:
                payload: Decoded JSON of the commentary API. The commentary
                    entries are read from ``payload['ShiData'][0]['Comments']``;
                    each entry contributes its ``Book`` and ``Content`` values.

            Returns:
                One ``book + newline + content + newline`` chunk per entry,
                concatenated; an empty string when the payload carries no
                commentary.
            """
            shi_data = (payload or {}).get('ShiData') or []
            if not shi_data:
                return ''
            comments = shi_data[0].get('Comments') or []
            chunks = []
            for comment in comments:
                book = comment.get('Book') or ''
                # The API marks line breaks with HTML <br /> tags.
                content = (comment.get('Content') or '').replace('<br />', '\n')
                chunks.append(book + '\n' + content + '\n')
            return ''.join(chunks)

        def _build_poetry_dict(self, dynasty, person_name, poetry_doc):
            """Turn one ``._poem`` element into the document stored in MongoDB."""
            poetry_id = (poetry_doc.attr('id') or '').replace('poem_', '')
            poetry_comment = ''
            # Commentary needs one extra API call, so only request it when the
            # page shows a commentary marker for this poem.
            if poetry_id and poetry_doc('.poemComment'):
                comment_url = self.comment_url.format(poetry_id)
                payload = self.get_response(comment_url).json()
                poetry_comment = self._format_comments(payload)
            return {
                '朝代': dynasty,
                '作者': person_name,
                '标题': poetry_doc('.poemCommentLink').text(),
                '标题注释': poetry_doc('.titleComment').text(),
                '类型': poetry_doc('.titleIndent').text(),
                '内容': poetry_doc('.poemSentence').text().strip(),
                '评注': poetry_comment,
                '注释': poetry_doc('.poemNote').text(),
            }

        def _find_next_page_url(self, doc):
            """Return the absolute URL of the next page, or ``''`` on the last page."""
            next_page_url = ''
            for link_doc in doc('#content>div:last-child a').items():
                href = link_doc.attr('href')
                # When several links match, the last one wins.
                if href and '下一页' in link_doc.text():
                    next_page_url = urljoin(self.start_url, href)
            return next_page_url

        def get_poetry(self, dynasty, person_name, person_url):
            """Store every poem of one author, following the pagination links.

            Pages are visited in a loop rather than by recursion, so an author
            with a very long listing cannot exhaust the interpreter's
            recursion limit.

            Args:
                dynasty: Dynasty name saved with each poem.
                person_name: Author name saved with each poem.
                person_url: URL of the first page of the author's poems.
            """
            visited = set()
            page_url = person_url
            # The visited set stops the crawl if a "next" link ever points
            # back to a page that was already processed.
            while page_url and page_url not in visited:
                visited.add(page_url)
                poetry_response = self.get_response(page_url)
                doc = pq(poetry_response.content.decode('utf8'))
                for poetry_doc in doc('._poem').items():
                    poetry_dict = self._build_poetry_dict(dynasty, person_name, poetry_doc)
                    self.client.insert_one(poetry_dict)
                page_url = self._find_next_page_url(doc)

        def get_person_content(self):
            """Visit every dynasty page and crawl the poems of each listed author."""
            for dynasty, dynasty_url in self.dynasty_dict.items():
                person_response = self.get_response(dynasty_url)
                doc = pq(person_response.content.decode('utf8'))
                for person_doc in doc('.inline1').items():
                    link = person_doc('a')
                    href = link.attr('href')
                    if not href:
                        # No link means there is no author page to crawl.
                        continue
                    self.get_poetry(dynasty, link.text(), urljoin(self.start_url, href))

        def run(self):
            """Run the full crawl: dynasties first, then authors and their poems."""
            self.get_dynasty_content()
            self.get_person_content()
    
    
    # Script entry point: building the crawler is what starts the crawl,
    # because Poetry.__init__ calls run() itself.
    if __name__ == "__main__":
        Poetry()

    注意:以上代码默认未配置代理,大量采集前请自行配置代理。

  • 相关阅读:
    【转载】高内聚低耦合
    【转载】locate命令的使用
    【转载】C内存对齐
    【原创】_INTSIZEOF 内存按照int对齐
    【转载】free查看内存
    Hive查询Join
    Hive数据查询
    Hive导入数据
    Hive表的修改Alter
    Hive中排序和聚集
  • 原文地址:https://www.cnblogs.com/lqn404/p/13840385.html
Copyright © 2011-2022 走看看