  • Getting Started with Web Scraping (1)

    1. Jupyter shortcuts

    Insert a cell: a (above), b (below)
    Delete a cell: x
    Run a cell: Shift+Enter
    Tab: code completion
    Switch cell mode (code/markdown): y, m
    Shift+Tab: open the help tooltip

    2. The requests module

    Workflow for writing code with the requests module:
    1. Specify the URL
    2. Send the request
    3. Get the data from the response object
    4. Parse the data
    5. Persist the data
    ## Crawl the Sogou homepage
    import requests

    # 1. Specify the URL
    url = 'https://www.sogou.com/'
    # 2. Send the request
    response = requests.get(url=url)
    # 3. Get the data from the response object
    page_text = response.text
    # 5. Persist the data (a full-page save needs no parsing step)
    with open('./sougou.html', 'w', encoding='utf-8') as f:
        f.write(page_text)
    # Requirement: crawl the result page of a Sogou search for a user-specified term
    import requests

    url = 'https://www.sogou.com/web'
    # Wrap the query parameter
    wd = input('enter a word:')
    params = {
        'query': wd
    }
    response = requests.get(url=url, params=params)

    page_data = response.content  # raw bytes, so the file is written in binary mode

    filename = wd + '.html'

    with open(filename, 'wb') as f:
        f.write(page_data)
        print('over')
    Sogou search for a specified term
    # Crawl Baidu Translate suggestion results
    import requests

    url = 'https://fanyi.baidu.com/sug'
    wd = input('enter a word:')
    data = {
        'kw': wd
    }

    response = requests.post(url=url, data=data)

    print(response.json())
    # text: str
    # content: bytes
    # json(): deserialized Python object (dict/list) -- see the sketch below
    Crawl Baidu Translate results
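    The three accessors serve different purposes, and picking the wrong one is a common beginner mistake. A minimal sketch (not from the original post; it uses the public echo service httpbin.org purely for illustration, and the 'url' field is specific to that service):

    # Sketch: choosing between text, content and json()
    import requests

    response = requests.get('https://httpbin.org/get')  # any endpoint that returns JSON

    as_text = response.text        # str, decoded using response.encoding
    as_bytes = response.content    # bytes, e.g. for images or files opened in 'wb' mode
    try:
        as_obj = response.json()   # dict/list, only valid when the body is JSON
        print(as_obj['url'])       # 'url' happens to be a field httpbin returns
    except ValueError:
        print('body was not JSON')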
    # Crawl movie detail data from the Douban category chart at https://movie.douban.com/
    import requests

    url = 'https://movie.douban.com/j/chart/top_list'
    params = {
        "type": "11",
        "interval_id": "100:90",
        "action": "",
        "start": "0",
        "limit": "20",
    }
    movie_data = requests.get(url=url, params=params).json()
    print(movie_data)
    Crawl the Douban movie category chart
    # Requirement: crawl cosmetics production licence data from the NMPA site http://125.35.6.84:81/xk/
    import requests

    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }
    id_list = []
    for page in range(1, 2):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        json_data = requests.post(url=url, data=data, headers=headers).json()
        for dic in json_data["list"]:
            id_list.append(dic["ID"])

    # Each company's detail data comes from a second POST endpoint, keyed by ID
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for company_id in id_list:
        detail_data = requests.post(url=detail_url, data={"id": company_id}, headers=headers).json()
        print(detail_data)
    Crawl cosmetics production licence data from the NMPA (National Medical Products Administration)
    # Method 1: requests
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }
    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551194867510&di=bbf61d08b5497fa04a519267c3efb3ee&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fitem%2F201402%2F09%2F20140209170955_AiTUh.thumb.700_0.jpeg'
    img_data = requests.get(url=url, headers=headers).content
    with open('./bingzhang.jpg', 'wb') as f:
        f.write(img_data)

    # Method 2: urllib
    import urllib.request

    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551194867510&di=bbf61d08b5497fa04a519267c3efb3ee&imgtype=0&src=http%3A%2F%2Fimg4.duitang.com%2Fuploads%2Fitem%2F201402%2F09%2F20140209170955_AiTUh.thumb.700_0.jpeg'
    urllib.request.urlretrieve(url=url, filename='./liweier.jpg')
    Download an image
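    For large files, a third approach (not shown in the original post) is to stream the download in chunks with requests instead of holding the whole body in memory. A minimal sketch, with a placeholder URL:

    # Sketch: streaming download with stream=True and iter_content()
    import requests

    url = 'https://example.com/big-image.jpg'  # placeholder URL for illustration
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open('./big-image.jpg', 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)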

    3. Parsing data with regular expressions

    Basic usage

    import re

    string = '''fall in love with you
    i love you very much
    i love she
    i love her'''

    re.findall('^i.*', string, re.M)  # re.M: ^ matches at the start of every line, so each line beginning with 'i' is returned


    # Match across the whole text
    string1 = """细思极恐
    你的队友在看书
    你的敌人在磨刀
    你的闺蜜在减肥
    隔壁老王在练腰
    """
    re.findall('.*', string1, re.S)  # re.S: . also matches newlines, so .* can span all the lines
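
    To make the two flags concrete, a small sketch (same English string as above) showing what findall returns with and without them:

    # Sketch: effect of re.M and re.S
    import re

    string = 'fall in love with you\ni love you very much\ni love she\ni love her'

    print(re.findall('^i.*', string))        # [] -- without re.M, ^ only matches the very start of the string
    print(re.findall('^i.*', string, re.M))  # ['i love you very much', 'i love she', 'i love her']
    print(re.findall('i love .*', string, re.S))  # a single match running to the end, because . also matches '\n'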

      Example

    import re
    import os
    import requests
    import urllib.request

    url = 'https://www.qiushibaike.com/pic/page/%s/?s=5170618'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    if not os.path.exists('./qiutu'):
        os.mkdir('./qiutu')

    start_page = int(input('enter a start pageNum:'))
    end_page = int(input('enter a end pageNum:'))

    for page in range(start_page, end_page + 1):
        new_url = url % page
        page_text = requests.get(url=new_url, headers=headers).text
        img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', page_text, re.S)
        for img_url in img_url_list:
            img_url = 'https:' + img_url
            imgName = img_url.split('/')[-1]
            imgPath = 'qiutu/' + imgName
            urllib.request.urlretrieve(url=img_url, filename=imgPath)
            print(imgName, "downloaded")
    print("over!!!")
    Crawl and save all the images from Qiushibaike

    4. Parsing data with bs4

    Install first:

    pip install bs4
    pip install lxml

    Parsing workflow:

      1. Load the source code to be parsed into a BeautifulSoup object.

      2. Call the object's methods or properties to locate the target tags in the source: find('name', class_="xxx"), find_all(), select().

      3. Extract the text or attribute values held by the located tags: string, text, get_text(), a['href'].
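
    A minimal, self-contained sketch of these three steps on a small hand-written HTML snippet (the HTML below is made up for illustration and is not taken from the case that follows):

    # Sketch: the three bs4 steps on an inline HTML string
    from bs4 import BeautifulSoup

    html = '''
    <div class="book-mulu">
      <ul>
        <li><a href="/book/ch1.html">Chapter 1</a></li>
        <li><a href="/book/ch2.html">Chapter 2</a></li>
      </ul>
    </div>
    '''

    # 1. Load the source into a BeautifulSoup object
    soup = BeautifulSoup(html, 'lxml')

    # 2. Locate tags with find / find_all / select
    first_a = soup.find('a')
    all_a = soup.select('.book-mulu > ul > li > a')

    # 3. Extract text and attribute values
    print(first_a.string)                  # Chapter 1
    print(first_a['href'])                 # /book/ch1.html
    print([a.get_text() for a in all_a])   # ['Chapter 1', 'Chapter 2']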

    Example:

    import requests
    from bs4 import BeautifulSoup

    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text

    soup = BeautifulSoup(page_text, 'lxml')

    # Locate every chapter link in the table of contents
    a_list = soup.select('.book-mulu > ul > li > a')

    fp = open('sanguo.txt', 'w', encoding='utf-8')

    for a in a_list:
        title = a.string
        detail_url = 'http://www.shicimingju.com' + a["href"]

        detail_page_text = requests.get(url=detail_url, headers=headers).text

        soup = BeautifulSoup(detail_page_text, 'lxml')
        content = soup.find('div', class_='chapter_content').text

        fp.write(title + '\n' + content)
        print(title, "downloaded")
    print("over!")
    fp.close()