  • 吴裕雄 Python Web Scraping (3)

    import hashlib
    
    # Incremental hashing: create an md5 object, feed it bytes with update(), read the digest
    md5 = hashlib.md5()
    md5.update(b'Test String')
    print(md5.hexdigest())

    import hashlib
    
    # One-shot form: pass the bytes to the constructor and chain hexdigest()
    md5 = hashlib.md5(b'Test String').hexdigest()
    print(md5)
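
    For data too large to hold in memory, the incremental update() form pays off: the digest can be built from chunks, and the result matches hashing everything at once. A minimal sketch, using 'data.bin' as a placeholder filename:

    import hashlib
    
    md5 = hashlib.md5()
    with open('data.bin', 'rb') as f:                  # 'data.bin' is illustrative
        for chunk in iter(lambda: f.read(8192), b''):  # read 8 KB at a time
            md5.update(chunk)                          # update() accumulates across calls
    print(md5.hexdigest())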

    import os
    import hashlib
    import requests
    
    # url = "http://opendata.epa.gov.tw/ws/Data/REWXQA/?$orderby=SiteName&$skip=0&$top=1000&format=json"
    url = "https://www.baidu.com"
    
    # Fetch the page source
    html = requests.get(url).text.encode('utf-8-sig')
    # Hash it to tell whether the page has changed since the last run
    md5 = hashlib.md5(html).hexdigest()
    old_md5 = ""  # default for the first run, when no saved digest exists
    md5_path = r'F:\pythonBase\pythonex\ch06\old_md5.txt'
    if os.path.exists(md5_path):
        with open(md5_path, 'r') as f:
            old_md5 = f.read()
    # Save the new digest for the next comparison
    with open(md5_path, 'w') as f:
        f.write(md5)
    
    if md5 != old_md5:
        print('Data has been updated...')
    else:
        print('Data unchanged, reading from the database...')
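
    The same check can be wrapped in a small helper so several pages can be tracked with one function. A minimal sketch under the same assumptions (the digest file path is illustrative):

    import os
    import hashlib
    import requests
    
    def page_changed(url, md5_path):
        """Return True when the page body's MD5 differs from the one saved last run."""
        html = requests.get(url).text.encode('utf-8-sig')
        new_md5 = hashlib.md5(html).hexdigest()
        old_md5 = ""
        if os.path.exists(md5_path):
            with open(md5_path, 'r') as f:
                old_md5 = f.read()
        with open(md5_path, 'w') as f:
            f.write(new_md5)
        return new_md5 != old_md5
    
    print(page_changed("https://www.baidu.com", r'F:\pythonBase\pythonex\ch06\old_md5.txt'))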

    import os
    import ast
    import hashlib
    import sqlite3
    import requests
    
    from bs4 import BeautifulSoup
    
    conn = sqlite3.connect(r'F:\pythonBase\pythonex\DataBasePM25.sqlite')  # open the database connection
    cursor = conn.cursor()  # create a cursor object
    
    # Create a data table
    sqlstr='''
    CREATE TABLE IF NOT EXISTS TablePM25 ("no" INTEGER PRIMARY KEY AUTOINCREMENT 
    NOT NULL UNIQUE ,"SiteName" TEXT NOT NULL ,"PM25" INTEGER)
    '''
    cursor.execute(sqlstr)
    
    url = "http://api.help.bj.cn/apis/aqilist/"
    #html=requests.get(url).text.encode('utf-8-sig')  # 读取网页原始码
    html=requests.get(url).text.encode('iso-8859-1').decode('utf-8-sig')
    # print(html)
    html = html.encode('utf-8-sig')
    # 判断网页是否更新
    md5 = hashlib.md5(html).hexdigest()
    old_md5 = ""
    
    md5_path = r'F:\pythonBase\pythonex\ch06\old_md5-.txt'
    if os.path.exists(md5_path):
        with open(md5_path, 'r') as f:
            old_md5 = f.read()
    with open(md5_path, 'w') as f:
        f.write(md5)
    print("old_md5=" + old_md5 + ";" + "md5=" + md5)  # show old and new digests for inspection
    if md5 != old_md5:
        print('Data has been updated...')
        sp = BeautifulSoup(html, 'html.parser')  # parse the response body
        jsondata = ast.literal_eval(sp.text)     # jsondata is a dict parsed from the JSON text
        js1 = jsondata.get("aqidata")            # the "aqidata" entry holds a list of city records
        # Clear the table before re-inserting
        conn.execute("delete from TablePM25")
        conn.commit()
        n = 1
        for city in js1:  # each element of js1 is one city's dict
            CityName = city["city"]   # value stored under the "city" key
            if city["pm2_5"] == "":   # an empty reading is stored as 0
                PM25 = 0
            else:
                PM25 = int(city["pm2_5"])
            print("City: {}   PM2.5={}".format(CityName, PM25))
            # Insert one record; parameters avoid quoting problems in city names
            cursor.execute("insert into TablePM25 values(?,?,?)", (n, CityName, PM25))
            n += 1
            conn.commit()
    else:
        print('Data unchanged, reading from the database...')
        cursor = conn.execute("select * from TablePM25")
        rows = cursor.fetchall()
        for row in rows:
            print("City: {}   PM2.5={}".format(row[1], row[2]))
    
    conn.close()  # close the database connection
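
    Since the API returns JSON text, the BeautifulSoup + ast.literal_eval detour can be replaced by requests' built-in JSON parsing. A minimal sketch of that alternative, assuming the same "aqidata" key:

    import requests
    
    resp = requests.get("http://api.help.bj.cn/apis/aqilist/")
    resp.encoding = 'utf-8'  # override the mis-detected charset before reading the body
    jsondata = resp.json()   # parse the JSON body straight into a dict
    for city in jsondata.get("aqidata", []):
        pm25 = 0 if city["pm2_5"] == "" else int(city["pm2_5"])
        print("City: {}   PM2.5={}".format(city["city"], pm25))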

    ........................................................

    import os
    import ast
    import hashlib
    import sqlite3
    import requests
    
    from bs4 import BeautifulSoup
    
    # cur_path = os.path.dirname(__file__)  # get the current script's directory
    # print(cur_path)
    conn = sqlite3.connect(os.path.join(r'F:\pythonBase\pythonex', 'DataBasePM25.sqlite'))  # open the database connection
    cursor = conn.cursor()  # create a cursor object
    
    # Create a data table
    sqlstr='''
    CREATE TABLE IF NOT EXISTS TablePM25 ("no" INTEGER PRIMARY KEY AUTOINCREMENT 
    NOT NULL UNIQUE ,"SiteName" TEXT NOT NULL ,"PM25" INTEGER)
    '''
    cursor.execute(sqlstr)
    
    url = "http://api.help.bj.cn/apis/aqilist/"
    # 读取网页原始码
    # html=requests.get(url).text.encode('utf-8-sig')
    html=requests.get(url).text.encode('iso-8859-1').decode('utf-8-sig')
    # print(html)
    html = html.encode('utf-8-sig')
    
    print('数据已更新...')    
    sp=BeautifulSoup(html,'html.parser')    #sp是bs4.Beautifulsoup类
    # 将网页内转换为 list,list 中的元素是 dict 
    jsondata = ast.literal_eval(sp.text)   #把sp.text字符串转为dict类型
    js=jsondata.get("aqidata")  #从jasondata中取出值为"aqidata"的key对应的value的列表
    
    # Clear the table before re-inserting
    conn.execute("delete from TablePM25")
    conn.commit()
    
    # Store the scraped records in the database one by one
    n = 1
    for city in js:
        CityName = city["city"]
        PM25 = 0 if city["pm2_5"] == "" else int(city["pm2_5"])
        print("City: {}   PM2.5={}".format(CityName, PM25))
        # Insert one record; parameters avoid quoting problems in city names
        cursor.execute("insert into TablePM25 values(?,?,?)", (n, CityName, PM25))
        n += 1
        conn.commit()  # flush each insert
    conn.close()  # close the database connection

    from time import sleep
    from selenium import webdriver
    
    urls = ['http://www.baidu.com','http://www.wsbookshow.com','http://news.sina.com.cn/']
    
    browser = webdriver.Chrome()
    browser.maximize_window()  # parentheses needed: without them the method is never called
    for url in urls:
        browser.get(url) 
        sleep(3)
    
    browser.quit()
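
    The fixed sleep(3) pauses even when a page has already loaded. Selenium's WebDriverWait can poll for a condition instead; a minimal sketch (the 10-second timeout is an arbitrary choice):

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    
    browser = webdriver.Chrome()
    browser.get('http://www.baidu.com')
    # Block until the <body> element is found, waiting at most 10 seconds
    WebDriverWait(browser, 10).until(lambda d: d.find_element_by_tag_name('body'))
    browser.quit()
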
    from selenium import webdriver  # import webdriver
    
    url = 'http://www.wsbookshow.com/bookshow/jc/bk/cxsj/12442.html'  # example page
    browser = webdriver.Chrome()  # create a Chrome browser object (this opens Chrome)
    browser.get(url)  # open the url in the browser
    login_form = browser.find_element_by_id("menu_1")  # find the element with id="menu_1"
    print(login_form.text)  # show the element's text
    # browser.quit()  # close the browser and quit the driver
    username = browser.find_element_by_name("username")  # find the element with name="username"
    print(username)
    password = browser.find_element_by_name("pwd")  # find the element with name="pwd"
    print(password)
    login_form = browser.find_element_by_xpath("//input[@name='arcID']")
    print(login_form)
    login_form = browser.find_element_by_xpath("//div[@id='feedback_userbox']")
    print(login_form)
    continue_link = browser.find_element_by_link_text('新概念美语')
    print(continue_link)
    continue_link = browser.find_element_by_link_text('英语')
    print(continue_link)
    heading1 = browser.find_element_by_tag_name('h1')
    print(heading1)
    content = browser.find_elements_by_class_name('topbanner')
    print(content)
    content = browser.find_elements_by_css_selector('.topsearch')
    print(content)
    # print(content.get_property)
    print()
    browser.quit()  # close the browser and quit the driver
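
    Note that the find_element_by_* helpers used above are the Selenium 3 API; Selenium 4 removed them in favor of By-based locators. A sketch of the equivalent calls for a few of the lookups, assuming Selenium 4 is installed:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    
    browser = webdriver.Chrome()
    browser.get('http://www.wsbookshow.com/bookshow/jc/bk/cxsj/12442.html')
    print(browser.find_element(By.ID, "menu_1").text)                        # was find_element_by_id
    print(browser.find_element(By.XPATH, "//div[@id='feedback_userbox']"))   # was find_element_by_xpath
    print(browser.find_elements(By.CLASS_NAME, 'topbanner'))                 # was find_elements_by_class_name
    browser.quit()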

    text = '中华'
    print(type(text))  # <class 'str'>
    text1 = text.encode('gbk')
    print(type(text1))  # <class 'bytes'>
    print(text1)  # b'\xd6\xd0\xbb\xaa'
    text2 = text1.decode('gbk')
    print(type(text2))  # <class 'str'>
    print(text2)  # 中华
    
    text4 = text.encode('utf-8')
    print(type(text4))  # <class 'bytes'>
    print(text4)  # b'\xe4\xb8\xad\xe5\x8d\x8e'
    text5 = text4.decode('utf-8')
    print(type(text5))  # <class 'str'>
    print(text5)  # 中华
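
    Decoding bytes with the wrong codec raises UnicodeDecodeError, which is exactly the failure the iso-8859-1 round trip below works around. A small sketch:

    text = '中华'
    gbk_bytes = text.encode('gbk')
    try:
        gbk_bytes.decode('utf-8')  # GBK bytes are not valid UTF-8
    except UnicodeDecodeError as e:
        print('decode failed:', e)
    print(gbk_bytes.decode('utf-8', errors='replace'))  # lossy fallback: U+FFFD replacement marks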

    import requests
     
    url="http://www.baidu.com"
    response = requests.get(url)
    content = response.text.encode('iso-8859-1').decode('utf-8')
    # requests decoded the body as ISO-8859-1; re-encode to recover the raw bytes,
    # then decode those bytes as UTF-8 to get the correct text
    print(content)
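
    The round trip works because ISO-8859-1 maps every byte to a code point one-to-one, so re-encoding the text recovers the exact original bytes. A simpler alternative is to tell requests the real charset before reading .text; a minimal sketch:

    import requests
    
    response = requests.get("http://www.baidu.com")
    response.encoding = 'utf-8'  # declare the charset so .text decodes correctly
    print(response.text)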

    from selenium import webdriver  # import the webdriver module
    
    chrome_obj = webdriver.Chrome()  # launch Chrome
    chrome_obj.get("https://www.baidu.com")  # open the URL
    print(chrome_obj.title)

    from selenium import webdriver  # import the webdriver module
    
    chrome_obj = webdriver.Chrome()  # launch Chrome
    
    chrome_obj.get(r"C:\desktop\text.html")  # open a local html page

  • Original post: https://www.cnblogs.com/tszr/p/10150031.html