zoukankan      html  css  js  c++  java
  • Python 爬取 书籍

    ...

    import requests
    from bs4 import BeautifulSoup
    
    
    def gethtml(url, h, timeout=10):
        """Download ``url`` using the request headers ``h`` and return its text.

        Args:
            url: Address of the page to fetch.
            h: Mapping of HTTP request headers handed to ``requests.get``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as a string.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within ``timeout``.
        """
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole crawl.
        r = requests.get(url, headers=h, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding guessed from the body instead of the one
        # taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    
    def getburl(r):
        """Collect the book detail-page URLs found in a listing page.

        Args:
            r: HTML text of a listing page.

        Returns:
            A list of absolute URLs, in document order, one for every element
            with class ``top-tit`` that contains a ``<p>`` with an ``<a href>``
            inside it. Elements without such a link are skipped.
        """
        # Imported locally so the function does not rely on a file-level import.
        from urllib.parse import urljoin

        soup = BeautifulSoup(r, 'lxml')
        burls = []
        for item in soup.find_all(class_='top-tit'):
            paragraph = item.p
            link = paragraph.a if paragraph is not None else None
            # An entry without the expected <p><a href=...> structure used to
            # abort the whole page with AttributeError/KeyError; skip it.
            if link is None or not link.get('href'):
                continue
            # urljoin copes with site-relative paths, paths lacking a leading
            # slash and already-absolute links, unlike plain concatenation.
            burls.append(urljoin('http://www.jb51.net', link['href']))
        return burls
    def getbhtml(url, timeout=10):
        """Download a book detail page and return its text.

        The request carries a ``Host`` header, a browser ``User-Agent`` and a
        ``Referer`` header set to ``url`` itself.

        Args:
            url: Address of the book detail page.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as a string.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within ``timeout``.
        """
        he = {
            'Host': 'www.jb51.net',
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole crawl.
        r = requests.get(url, headers=he, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding guessed from the body instead of the one
        # taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    
    def bookinfo(r):
        """Extract the title and the link targets from a book detail page.

        Args:
            r: HTML text of a book detail page.

        Returns:
            A ``(name, downurl)`` tuple. ``name`` is the text of the ``<h1>``
            inside the first element with class ``new2`` (``''`` when that
            element or its heading is absent). ``downurl`` is the list of
            ``href`` values of the anchors inside the first element matched by
            ``class_='content greena clearfix'`` (empty when it is absent).
        """
        soup = BeautifulSoup(r, 'lxml')

        # A page without the expected title markup used to raise
        # AttributeError on None; fall back to an empty name instead.
        title_box = soup.find(class_='new2')
        heading = title_box.h1 if title_box is not None else None
        name = heading.text if heading is not None else ''

        durls = soup.find(class_='content greena clearfix')
        if durls is None:
            return name, []

        # href=True keeps only anchors that carry an href attribute, so an
        # <a> without one no longer raises KeyError.
        downurl = [a['href'] for a in durls.find_all('a', href=True)]
        return name, downurl
    
    if __name__ == "__main__":
        # Script entry point: walk listing pages 1-3, fetch every book page
        # linked from them and print the extracted information.

        h = {
            'Host': 'www.jb51.net',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        # Disabled: optional output of the results to a text file.
        # file = open('booksdown.txt', 'a')
        for page in range(1, 4):
            url = 'http://www.jb51.net/books/list476_{}.html'.format(page)
            r = gethtml(url, h)
            burls = getburl(r)
            for burl in burls:
                br = getbhtml(burl)
                binfos = bookinfo(br)
                print(binfos)
                print('书籍原地址:{}'.format(burl))
                # Blank lines between books. The escapes had been expanded
                # into real line breaks, which left the string literal
                # unterminated (a SyntaxError).
                print('\n\n')
        #         file.write(str(binfos))
        #         file.write('\n\n')
        # file.close()
  • 相关阅读:
    chrome被篡改 导航到搜狗 或者特殊页面
    安装tomcat jdk
    监控tomcat 启动
    关于如何关闭445端口
    python模拟大数据登陆
    搭建vsftpd服务
    kali syn洪水攻击实例
    HP880G3 安装RHEL6.5
    Python_列表
    Python第一个请求接口
  • 原文地址:https://www.cnblogs.com/mysterious-killer/p/10157119.html
Copyright © 2011-2022 走看看