zoukankan      html  css  js  c++  java
  • Python 爬虫学习

    #coding:utf-8
    #author:Blood_Zero
    
    '''
        1、获取网页信息
        2、解决编码问题,通过charset库(默认不安装这个库文件)
    '''
    import urllib
    import urllib2
    
    # Fetch the demo page once. The response object `html` is reused by the
    # metadata queries further down, so it is kept open until they are done.
    # NOTE: print is written as print(<single expression>) throughout; under
    # Python 2 that prints exactly what the bare print statement would.
    url = "http://192.168.1.135/myself/"
    html = urllib.urlopen(url)
    content = html.read()
    print(content)
    # If the page is served in another encoding the raw bytes print as
    # mojibake; transcode explicitly in that case, e.g. for a GBK page:
    #print(content.decode('gbk').encode('utf-8'))


    '''
        简易获取网页信息
    '''
    # (The string above is a section banner: "quick look at page information".)

    # URL reported by the response object.
    print("当前URL:"+str(html.geturl()))

    # HTTP status code of the response.
    print("当前状态码:"+str(html.code))
    #print("当前状态码:"+str(html.getcode()))

    # Response headers. The "\n" escape must stay inside the literal: a real
    # line break inside a single-quoted string is a syntax error.
    print("当前头信息:\n"+str(html.headers))
    #print("当前头信息:\n"+str(html.info()))

    # Value of the "charset" parameter taken from the response headers.
    print("当前网站使用编码:"+str(html.info().getparam("charset")))

    # Everything needed from the response has been read; release it.
    html.close()

    # Download the page source to a local file. The raw string keeps the
    # Windows backslashes literal instead of relying on "\P", "\p" and "\u"
    # happening not to be escape sequences.
    urllib.urlretrieve(url, r"E:\Python_Code\pyTools\url.txt")
    
    
    '''
        模拟浏览器访问网址
    '''
    # Approach 1: create the Request first, then attach headers one by one.
    req=urllib2.Request(url)
    # Add the request headers: a Firefox User-Agent string, "Get" and "Host".
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0")
    # NOTE(review): "Get" does not look like a standard HTTP header name --
    # confirm that sending it is intentional.
    req.add_header("Get",url)
    req.add_header("Host","192.168.1.135")
    
    # Send the request, then print the response body followed by the headers
    # stored on the Request object.
    new_html = urllib2.urlopen(req)
    print new_html.read()
    print req.headers.items()
    
    # Approach 2: collect the same three headers in a dict and hand it to the
    # Request constructor through the headers= keyword.
    myheader={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
        "Host":"192.168.1.135",
        "Get":url
    }
    req1 = urllib2.Request(url,headers=myheader)
    # Same sequence as approach 1: send, print the body, print the headers.
    new_html_1 = urllib2.urlopen(req1)
    print new_html_1.read()
    print req1.headers.items()
    
    
    '''
        在网页中查询指定文件
    '''
    def get_content(url):
        """Download *url* and return the response body.

        Args:
            url: Address of the page to fetch.

        Returns:
            Whatever ``read()`` on the opened response yields (the page source).

        Any error raised while opening or reading the URL propagates to the
        caller; the response is closed in either case.
        """
        html = urllib.urlopen(url)
        try:
            return html.read()
        finally:
            # Runs on success and on failure, so a read error cannot leak
            # the open response.
            html.close()
    
    def get_file(self):
        """Print and return the ``.php`` link targets found in an HTML string.

        Args:
            self: HTML source text to scan. Despite the name this is a plain
                function, not a method; the parameter name is kept so that
                existing callers are unaffected.

        Returns:
            A list with one entry per ``a href=...php`` occurrence, in document
            order. Each entry is the text captured right after ``href=`` up to
            and including ``.php``, so a quoted attribute keeps its opening
            quote character. The list is empty when nothing matches.
        """
        # Imported here because the file's top-level imports do not include re.
        import re

        # The dot before "php" is escaped so that only a literal ".php"
        # matches; unescaped it would accept any character (e.g. "xphp").
        pat = re.compile(r'a href=(.+?\.php)')

        file_code = pat.findall(self)
        # Single-argument print(...) prints the same text under Python 2's
        # print statement: the list repr followed by an empty line.
        print(str(file_code) + "\n")
        return file_code
    
    # Fetch the SQL_Injection directory page and list the .php links in it.
    listing_url = "http://192.168.1.135/myself/SQL_Injection/"
    info = get_content(listing_url)
    get_file(info)
  • 相关阅读:
    导出查询结果到excle
    导出所选行为excle
    spring security LDAP获取用户信息
    spring security防御会话伪造session攻击
    Linux安装Loadrunner generator
    Centos7 安装gitlab
    kafka 安装部署
    zookeeper 搭建
    Oracle GoldenGate对接 Oracle 11g和Kafka
    suse 11 sp4 设置yast 安装源
  • 原文地址:https://www.cnblogs.com/BloodZero/p/4648891.html
Copyright © 2011-2022 走看看