  • Web crawler practice

    1.
    from bs4 import BeautifulSoup
    
    info = []
    with open('D:/web1111/new_index.html','r') as wb_data:
        Soup = BeautifulSoup(wb_data,'lxml')
        images = Soup.select('body > div.main-content > ul > li > img')
        titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
        descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')
        rates = Soup.select('body > div.main-content > ul > li > div.rate > span')
        cates = Soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')
      #  print (images,titles,descs,rates,cates)
    
    for title,image,desc,rate,cate in zip(titles,images,descs,rates,cates):
        data = {
            'title': title.get_text(),
            'rate': rate.get_text(),
            'desc': desc.get_text(),
            'cate': list(cate.stripped_strings),
            'image': image.get('src')
        }
        info.append(data)
    
    
    for i in info:
        if float(i['rate'])>3:
            print(i['title'],i['cate'])
    
    '''
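    Raw selector paths copied from Chrome DevTools; the :nth-child(1) qualifiers were dropped in the code above so each selector matches every <li>: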
    body > div.main-content > ul > li:nth-child(1) > div.article-info > h3 > a
    body > div.main-content > ul > li:nth-child(1) > div.article-info > p.meta-info > span:nth-child(2)
    body > div.main-content > ul > li:nth-child(1) > div.rate > span
    body > div.main-content > ul > li:nth-child(1) > div.article-info > p.description
    body > div.main-content > ul > li:nth-child(1) > img
    '''
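
    The selectors above assume a page structure like the hypothetical fragment below (new_index.html itself is not included in this post); this self-contained sketch shows them matching:

    from bs4 import BeautifulSoup

    # Hypothetical markup mirroring the selectors above; the real file may differ.
    html = '''
    <body><div class="main-content"><ul><li>
        <img src="images/0001.jpg">
        <div class="article-info">
            <h3><a href="#">Sample title</a></h3>
            <p class="meta-info"><span>tag-a</span><span>tag-b</span></p>
            <p class="description">Sample description</p>
        </div>
        <div class="rate"><span>4.5</span></div>
    </li></ul></div></body>
    '''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('body > div.main-content > ul > li > div.rate > span')[0].get_text())  # 4.5
    print(list(soup.select('p.meta-info')[0].stripped_strings))                              # ['tag-a', 'tag-b']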


    2.
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup
    import requests
    
    
    url = 'http://bj.xiaozhu.com/fangzi/1508951935.html'
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text,'lxml')
    
    
    # Since this is a single page, the list that select() returns has exactly one element, which is also the one we want; take it out with the "[0]" index
    # before applying further extraction methods, because BeautifulSoup's extraction methods can't be used on a list ;)
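    # (bs4's select_one() is an equivalent shortcut: soup.select_one('div.pho_info > h4')
    #  returns the first match directly, with no [0] indexing.)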
    
    
    title = soup.select('div.pho_info > h4')[0].text
    address = soup.select('div.pho_info > p')[0].get('title') # same idea as get('href'): both are just attributes of the tag, and all we need is the attribute's value
    price = soup.select('div.day_l > span')[0].text
    pic = soup.select('#curBigImage')[0].get('src')   # "#" is an id selector; since an id is unique in the page, this pinpoints a single element
    
    host_name = soup.select('a.lorder_name')[0].text
    host_gender = soup.select('div.member_pic > div')[0].get('class')[0]
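    # note: get('class') returns a list (a tag can carry several classes), hence the trailing [0]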
    
    # print here and inspect the results
    print(title)
    
    print(address)
    print(price)
    print(pic)
    
    print(host_name)
    print(host_gender)
    
    # The results show that each gender gets its own icon style (class), so write a function to convert the class name
    def print_gender(class_name):
        if class_name == 'member_ico1':
            return 'female'
        if class_name == 'member_ico':
            return 'male'
    
    
    data = {
        'title':title,
        'address':address,
        'price':price,
        'pic':pic,
        'host_name':host_name,
        'host_gender':print_gender(host_gender)
    
    }
    
    print(data)
    
    
    # -------------------Supplement------------------
    # How to collect links in bulk
    
    page_link = [] # <- every detail-page link is stored here; to parse the details, just iterate this list and request each URL
    
    def get_page_link(page_number):
        for each_number in range(1, page_number + 1): # 24 links per list page; the argument is how many pages to walk
            full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
            wb_data = requests.get(full_url)
            soup = BeautifulSoup(wb_data.text,'lxml')
            for link in soup.select('a.resule_img_a'): # the <a> tags whose class is resule_img_a carry the detail-page links
                page_link.append(link.get('href'))     # store the href value, not the tag object
    
    # ---------------------
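
    Wiring the two halves together: a minimal sketch, where get_detail is a name introduced here for illustration and the detail-page selectors above are assumed to still match the live site:

    def get_detail(url):
        # same single-page parsing as above, wrapped in a function for reuse
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        return {
            'title': soup.select('div.pho_info > h4')[0].text,
            'price': soup.select('div.day_l > span')[0].text,
        }

    get_page_link(3)            # collect links from the first 3 list pages
    for link in page_link:
        print(get_detail(link)) # then parse each detail page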

    3.

    from bs4 import BeautifulSoup
    
    data = []
    path = './web/new_index.html'
    
    with open(path, 'r') as f:
        Soup = BeautifulSoup(f.read(), 'lxml')
        titles = Soup.select('ul > li > div.article-info > h3 > a')
        pics = Soup.select('ul > li > img')
        descs = Soup.select('ul > li > div.article-info > p.description')
        rates = Soup.select('ul > li > div.rate > span')
        cates = Soup.select('ul > li > div.article-info > p.meta-info')
    
    for title, pic, desc, rate, cate in zip(titles, pics, descs, rates, cates):
        info = {
            'title': title.get_text(),
            'pic': pic.get('src'),
            'descs': desc.get_text(),
            'rate': rate.get_text(),
            'cate': list(cate.stripped_strings)
        }
        data.append(info)
    
    for i in data:
        if float(i['rate']) >= 3:
            print(i['title'], i['cate'])
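
    float() raises a ValueError if a rate field is empty or non-numeric; a guarded variant of the filter (a sketch, not part of the original exercise):

    for i in data:
        try:
            if float(i['rate']) >= 3:
                print(i['title'], i['cate'])
        except ValueError:
            pass  # skip entries whose rate isn't a number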

    4.

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import requests
    from bs4 import BeautifulSoup
    def fit(url):
        base_url = url
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # a desktop UA so the server returns the normal page
        html = requests.get(base_url,headers=headers)
        soup = BeautifulSoup(html.text,'lxml')
            # print soup
            data = []
            # content = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.title > a')[0]
            # url_last = content.get('href')
            url_first = 'https://www.zhihu.com'
            # url_context = url_first+url_last
        def url_handle(url_last): # exclude zhuanlan column links: only relative /question links get the site root prepended
                if url_last.startswith('/question'):
                    url_context = url_first + url_last
                    return url_context
                else:
                    url_context = url_last
                    return url_context
    
            titles = soup.select('div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.title > a')
            writes = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.content > div > div.entry-body > div.entry-meta > div > span.author-link-line > a')
            prizes = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.content > div > div.entry-left.hidden-phone > a')
            contents = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.title > a')
    
            for title,write,prize,content in zip(titles,writes,prizes,contents):
                url_last = content.get('href')
                url_context = url_handle(url_last)
                # url_context = url_first + url_last
                html_content = requests.get(url_context,headers=headers)
                soup = BeautifulSoup(html_content.text,'lxml')
                if url_context.startswith('https://www.zhihu.com'):
                    first_answer = soup.select('#zh-question-answer-wrap > div:nth-of-type(1) > div.zm-item-rich-text.expandable.js-collapse-body > div.zm-editable-content.clearfix')[0].get_text()
                else:
                    first_answer = 'no answer'
                info = {
                    'title': title.get_text(),
                    'write': write.get_text(),
                    'prize':  prize.get_text(),
                    'content': first_answer
                }
                data.append(info)
            for a in data:
            print('title:' + a['title'])
            print('write:' + a['write'])
            print('prize:' + a['prize'])
            print('content:' + a['content'])
    
    url = 'https://www.zhihu.com/search?type=content&q=健身'
    fit(url)
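
    The url_handle helper is the crux of fit(): Zhihu search results mix relative /question/... links with absolute zhuanlan column URLs, and only the former need the site root prepended. The same idea as a standalone, network-free sketch:

    def url_handle(url_last, url_first='https://www.zhihu.com'):
        # relative question links get the site root prepended;
        # column links are already absolute and pass through unchanged
        if url_last.startswith('/question'):
            return url_first + url_last
        return url_last

    print(url_handle('/question/12345'))                  # https://www.zhihu.com/question/12345
    print(url_handle('https://zhuanlan.zhihu.com/p/678')) # unchanged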