  • Crawler notes 4: basic usage of Python requests, the Beautiful Soup library, and SQLite

    Requests

    import requests
    
    from PIL import Image
    
    from io import BytesIO
    
    import json
    
    
    
    url = "..........."
    
    print(dir(requests))  # list the module's attributes to see the basic API
    
    r = requests.get(url)
    
    print(r.text)
    
    print(r.status_code)
    
    print(r.encoding)

    Passing parameters

    params = {'k1': 'v1', 'k2': 'v2', 'k3': [1, 2, 3], 'k4': None}  # a list value becomes repeated query parameters; None values are dropped
    r = requests.get('http://httpbin.org/get', params=params)
    
    print(r.url)

    Binary data

    r= requests.get('.........')
    
    image = Image.open(BytesIO(r.content))
    
    image.save('image.jpg')

    JSON handling

    r = requests.get('https://github.com/timeline.json')
    
    print(type(r.json()))
    
    print(r.json())
    
    print(r.text)

    Raw data handling

    r = requests.get('.........', stream=True)  # stream=True avoids loading the whole body into memory at once
    
    with open('image.jpg', 'wb+') as f:
    
      for chunk in r.iter_content(1024):
    
        f.write(chunk)

    Submitting forms

    form = {'username': 'xxx', 'ddddefsc': 'dscdsc'}
    
    r = requests.post('http://httpbin.org/post', data=form)              # sent as form-encoded fields
    
    r = requests.post('http://httpbin.org/post', data=json.dumps(form))  # sent as a raw JSON string in the body
    
    print(r.text)
    
    
    
    cookies
    
    url ='xxxxxxxxxxxx'
    
    r = requests.get(url)
    
    cookies = r.cookies
    
    for k, v in cookies.get_dict().items():      # the standard way to read the returned cookies
      print(k, v)
    
    
    
    cookies = {'c1':'v1'}
    
    r = requests.get('http://httpbin.org/cookies', cookies=cookies)
    
    print(r.text)

    Redirects and redirect history (useful for tracking how a site forwards you)

    r = requests.head('http://www.baidu.com', allow_redirects=True)
    
    print(r.url)
    
    print(r.status_code)
    
    print(r.history)

    Proxies

    proxies = {'http': '...', 'https': '...'}          # can also be used to route traffic through a proxy and get around network restrictions
    
    r = requests.get('http://httpbin.org/cookies', proxies=proxies)

    Beautiful Soup

    from bs4 import BeautifulSoup
    # Tag
    soup = BeautifulSoup(open('test.html'), 'html.parser')
    print(soup.prettify())
    print(soup.title.name)
    print(soup.title)
    # String
    print(type(soup.title.string))
    print(soup.title.string)
    # Comment
    print(type(soup.a.string))
    print(soup.a.name)
    
    for item in soup.body.contents:
        print(item.name)
        # .contents only lists the direct children
    
    # CSS selectors
    print(soup.select('.sister'))    # returns a list
    print(soup.select('a'))
    print(soup.select('#link'))      # select by id
    
    print(soup.select('head > title'))

    HTMLParser

    from html.parser import HTMLParser  # Python 3 module name; in Python 2 it was HTMLParser

    class MyParser(HTMLParser):
      def handle_decl(self, decl):
        HTMLParser.handle_decl(self, decl)
        print('decl %s' % decl)

      def handle_starttag(self, tag, attrs):
        HTMLParser.handle_starttag(self, tag, attrs)
        print('<' + tag + '>')

      def handle_endtag(self, tag):
        HTMLParser.handle_endtag(self, tag)
        print('</' + tag + '>')

      def handle_data(self, data):
        HTMLParser.handle_data(self, data)
        print('data %s' % data)

      def handle_startendtag(self, tag, attrs):
        HTMLParser.handle_startendtag(self, tag, attrs)
        print('<' + tag + '/>')

      def handle_comment(self, data):
        HTMLParser.handle_comment(self, data)
        print('comment %s' % data)

      def close(self):
        HTMLParser.close(self)
        print('Close')

    demo = MyParser()
    demo.feed(open('hello.html').read())
    demo.close()

     For HTML, avoid parsing it with a strict XML parser, because real-world HTML is often not well-formed.
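
    A minimal sketch of the difference (the fragment below with an unclosed <p> tag is a made-up example): BeautifulSoup's lenient html.parser recovers from the broken markup, while the standard library's strict XML parser raises an error.

    from bs4 import BeautifulSoup
    import xml.etree.ElementTree as ET
    
    broken = '<html><body><p>unclosed paragraph<p>another one</body></html>'
    
    soup = BeautifulSoup(broken, 'html.parser')
    print(soup.find_all('p'))                # no exception; both <p> tags are found
    
    try:
        ET.fromstring(broken)
    except ET.ParseError as e:
        print('strict XML parsing fails:', e)   # mismatched tag error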

    sqlite3

    import sqlite3
    
    conn = sqlite3.connect('test.db')
    create_sql = 'create table company(id int primary key not null, emp_name text not null);'
    conn.execute(create_sql)
    insert_sql = 'insert into company values(?, ?)'
    
    conn.execute(insert_sql, (100, 'LY'))
    conn.execute(insert_sql, (200, 'July'))
    conn.commit()  # without commit, the inserts are rolled back when the connection closes
    cursor = conn.execute('select id, emp_name from company')
    for row in cursor:
        print(row[0], row[1])
    conn.close()

    MySQL

    For MySQL you need to specify the host (ip/port), username, and password when connecting,

    and remember to call conn.commit() after inserting data, as in the sketch below.
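
    A minimal sketch, assuming the pymysql driver is installed; the host, credentials, database name, and the reuse of the company table from the sqlite3 example are placeholders to adapt to your own server.

    import pymysql
    
    conn = pymysql.connect(host='127.0.0.1', port=3306,
                           user='root', password='xxx', database='test')
    try:
        with conn.cursor() as cur:
            cur.execute('insert into company values(%s, %s)', (300, 'Tom'))
        conn.commit()   # like sqlite3, MySQL drops uncommitted inserts when the connection closes
    finally:
        conn.close()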
