zoukankan      html  css  js  c++  java
  • 爬虫百度贴吧

    import requests
    from bs4 import BeautifulSoup
    import re
    
    def getHTMLText(url):
        """Download *url* and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.

        Returns:
            The page text, or the literal string "Error" when the request
            fails (connection problem, timeout, or a non-2xx status).
        """
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            # Only request failures map to the sentinel; KeyboardInterrupt
            # and programming errors are no longer silently swallowed.
            return "Error"
        
    def parsePage(html):
        """Extract thread summaries from the markup of one listing page.

        Args:
            html: Markup of a listing page.

        Returns:
            A list of dicts with the keys 'title', 'author', 'time',
            'reply_num' and 'link', one per thread entry that contains every
            expected element. Entries missing any of them are skipped.
        """
        infos = []
        soup = BeautifulSoup(html, 'lxml')
        for li in soup.find_all('li', class_=" j_thread_list clearfix"):
            # Look each element up once; the title anchor supplies both the
            # title text and the relative link.
            anchor = li.find('a', class_="j_th_tit ")
            author_tag = li.find('span', class_=re.compile("tb_icon_author "))
            created_tag = li.find('span', class_="pull-right is_show_create_time")
            replies_tag = li.find('span', class_="threadlist_rep_num center_text")

            # find() returns None for a missing element; dereferencing it
            # would raise and abort the whole run, so skip such entries.
            if any(tag is None for tag in (anchor, author_tag, created_tag, replies_tag)):
                continue

            href = anchor.get('href')
            author_title = author_tag.get('title')
            if href is None or author_title is None:
                continue

            infos.append({
                'title': anchor.string,
                # Keep only the text after the last ':' of the title attribute.
                'author': author_title.split(':')[-1].strip(),
                'time': created_tag.string,
                'reply_num': replies_tag.string,
                'link': 'http://tieba.baidu.com' + href,
            })
        return infos
    
    
    def ToFile(infos):
        """Append one formatted line per thread record to test.txt.

        Args:
            infos: Iterable of dicts holding the keys 'title', 'author',
                'time', 'reply_num' and 'link'.
        """
        # Separators are written as escape sequences: a raw line break inside
        # a single-quoted literal is a syntax error.
        line_format = '标题:{} \t 发帖人:{} \t 发帖时间:{} \t 回复:{} \t 链接:{} \n'
        with open('test.txt', 'a+', encoding='utf-8') as f:
            for info in infos:
                f.write(line_format.format(
                    info['title'], info['author'], info['time'], info['reply_num'], info['link']))
        
    def main(base_url, deep):
        """Fetch *deep* consecutive listing pages and store their threads.

        Args:
            base_url: Listing URL to which the '&pn=' offset is appended.
            deep: Number of pages to fetch; the offset grows by 50 per page.
        """
        for page_index in range(deep):
            page_url = base_url + '&pn=' + str(50 * page_index)
            page_html = getHTMLText(page_url)
            ToFile(parsePage(page_html))
            
    if __name__ == '__main__':
        # Script entry point: crawl four listing pages of the forum selected
        # by the kw query parameter.
        main(base_url='http://tieba.baidu.com/f?kw=考研&ie=utf-8', deep=4)
    

      

  • 相关阅读:
    5、include为应用指定多个struts配置文件
    4、struts处理流程和action的管理方式
    8、类型转换器
    7、请求参数接收
    UESTC 2014 Summer Training #6 Div.2
    Codeforces Round #FF
    css ul li去除圆点
    css a标签去除下划线
    Axure的热区元件的作用
    结组开发项目(TD学生助手)
  • 原文地址:https://www.cnblogs.com/key221/p/9530373.html
Copyright © 2011-2022 走看看