zoukankan      html  css  js  c++  java
  • 【Python爬虫】之爬取页面内容、图片以及用selenium爬取

    下面不做过多文字描述:

    首先、安装必要的库

    # 安装BeautifulSoup
    pip install beautifulsoup4
    
    # 安装requests
    pip install requests

    其次、上代码!!!

    ①重定向网站爬虫h4文字

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from PIL import Image


    # Scrape text from a redirecting site.
    # NOTE(review): the section title says "h4" but the code scrapes <p> tags;
    # kept as-is — confirm which tag was actually intended.
    url = "http://www.itest.info/courses"
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    for courses in soup.find_all('p'):
        print(courses.text)
        # fixed: the original string literal was broken across two lines
        # (a syntax error); the intent was a blank separator line.
        print("\n")

    ②v2ex爬取标题

    import requests
    from bs4 import BeautifulSoup

    # Scrape hot-topic and regular-topic titles from the v2ex front page.
    url = "https://www.v2ex.com"
    v2ex = BeautifulSoup(requests.get(url).text, 'html.parser')

    # Hot topics: each <span class="item_hot_topic_title"> wraps one <a>.
    for hot in v2ex.find_all('span', class_='item_hot_topic_title'):
        link = hot.find('a')
        print(link.text, link['href'])

    # Regular topics: the hrefs are site-relative, so prefix the base URL.
    for topic in v2ex.find_all("a", class_="topic-link"):
        print(topic.text, url + topic["href"])

    ③煎蛋爬虫图片

    import requests
    from bs4 import BeautifulSoup
    
    
    
    # Browser-like User-Agent header, sent with every request below.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    
    def download_file(url, dest_dir="/Users/zhangc/Desktop/GitTest/project_Buger_2/Python爬虫/img/"):
        '''Download *url* into *dest_dir* in 1 KiB chunks and return the local path.

        The destination keeps the original hard-coded directory as its default
        for backward compatibility, but can now be overridden per call.
        '''
        import os

        print('Downloading %s' % url)  # fixed typo: was "Downding"
        local_filename = url.split('/')[-1]
        img_path = os.path.join(dest_dir, local_filename)
        print(local_filename)
        # stream=True avoids loading the whole body into memory; the context
        # manager releases the connection (the original never closed it).
        with requests.get(url, stream=True, headers=headers) as r:
            with open(img_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        return img_path
    
    # Fetch the jandan.net drawings listing page and parse it for <img> tags.
    url = 'http://jandan.net/drawings'
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    
    def valid_img(src):
        '''Return True if *src* looks like a sinaimg-hosted JPEG URL.

        Used as the ``src=`` filter function for ``soup.find_all('img', ...)``.
        '''
        # fixed: require the '.jpg' extension with its dot — the original
        # 'jpg' suffix also matched names merely ending in the letters "jpg".
        return src.endswith('.jpg') and '.sinaimg.cn' in src
    
    # Walk every matching <img>, normalise protocol-relative URLs, download.
    for img in soup.find_all('img', src=valid_img):
        link = img['src']
        if not link.startswith('http'):
            link = 'http:' + link
        download_file(link)

    ④爬取知乎热门标题

    import requests
    from bs4 import BeautifulSoup

    # Scrape hot titles from Zhihu's explore page.
    # NOTE(review): the browser UA is presumably needed because Zhihu rejects
    # the default python-requests UA — confirm.
    headers = {
        # fixed: the original value accidentally began with the literal text
        # "user-agent: ", duplicating the header name inside the header value.
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
    }
    url = "https://www.zhihu.com/explore"
    zhihu = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    for title in zhihu.find_all('a', class_="ExploreSpecialCard-contentTitle"):
        print(title.text)

    ⑤selenium爬虫知乎热门标题

    import requests
    from bs4 import BeautifulSoup
    # fixed: this snippet used webdriver/By without importing them (they were
    # only imported in snippet ①), so run standalone it raised NameError.
    from selenium import webdriver
    from selenium.webdriver.common.by import By


    # Scrape Zhihu explore titles with Selenium (JS-rendered content).
    url = "https://www.zhihu.com/explore"
    driver = webdriver.Chrome("/Users/zhangc/Desktop/GitTest/project_Buger_2/poium测试库/tools/chromedriver")
    driver.get(url)

    info = driver.find_element(By.CSS_SELECTOR, "div.ExploreHomePage-specials")
    for title in info.find_elements(By.CSS_SELECTOR, "div.ExploreHomePage-specialCard > div.ExploreSpecialCard-contentList > div.ExploreSpecialCard-contentItem > a.ExploreSpecialCard-contentTitle"):
        print(title.text, title.get_attribute('href'))

    # Release the browser session — the original leaked it.
    driver.quit()
  • 相关阅读:
    Logger.getLogger与LogFactory.getLog
    log4j详解
    游戏史上80重要创新(原资料来自17173)
    软件开发工具介绍之 6.Web开发工具
    JAVA NIO 简介
    Alan Kay 你需要认识的一个天才
    大学计算机学习路线
    软件开发工具介绍之 5. 计划管理
    软件开发工具介绍之 4. 建模工具
    关于最近“361强奸360强奸QQ”,且是光天化日之下
  • 原文地址:https://www.cnblogs.com/Owen-ET/p/12229046.html
Copyright © 2011-2022 走看看