  • Python practice - small scripts

    1. Crawling subdomains

    #!/usr/bin/python 
    # -*- coding: utf-8 -*-
    
    import requests
    import re
    import sys
    def get(domain):
        url = 'http://i.links.cn/subdomain/'
        payload = ("domain={domain}&b2=1&b3=1&b4=1".format(domain=domain))
        r = requests.post(url=url, data=payload)      # send the form fields in the POST body
        con = r.text
        a = re.compile('value="(.+?)"><input')        # non-greedy match on anything between the quotes
        result = a.findall(con)
        for i in result:
            print(i)

    if __name__ == '__main__':
        command = sys.argv[1:]                        # take all trailing command-line arguments
        f = "".join(command)                          # join them into a single string
        get(f)
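
    The same POST can also be written with a dict payload, letting requests do the form encoding itself. A minimal sketch of that variant (same endpoint and field names as the script above):

    # dict payload: requests URL-encodes the form fields for us
    payload = {'domain': domain, 'b2': 1, 'b3': 1, 'b4': 1}
    r = requests.post('http://i.links.cn/subdomain/', data=payload)

    Run it as, for example, python subdomain.py example.com (subdomain.py is just an assumed name for the script file).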
    

    2. Scraping I春秋 (ichunqiu) featured-page titles

    #!/usr/bin/python 
    #coding=GBK
    
    
    import requests
    import re
    def gethtml():
        url = 'https://bbs.ichunqiu.com/portal.php'
        headers = {
            'Host': 'bbs.ichunqiu.com',
            'Connection': 'close',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
        r = requests.get(url=url, headers=headers)
        html = r.text                                 # decoded page source (r.content would be raw bytes)
        title = re.findall(r'target="blank" class="ui_colorG" style="color: #555555;">(.*?)</a></h3>', html)
        for i in title:
            print(i)

    if __name__ == '__main__':
        gethtml()
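
    The regex above is brittle if the page markup changes. The same titles could also be pulled out with BeautifulSoup (used in script 4 below); a minimal sketch, assuming the links still carry the ui_colorG class that the regex keys on:

    from bs4 import BeautifulSoup

    def get_titles(html):
        # select the portal links by the class name used in the regex above
        soup = BeautifulSoup(html, 'lxml')
        return [a.get_text(strip=True) for a in soup.select('a.ui_colorG')]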
    

    3. Scraping gallery images

    #!/usr/bin/python 
    # -*- coding: utf-8 -*-
    
    import requests,re,sys
    import urllib
    
    def getimg():
        count = 0                                             # running image counter across every page
        for x in range(1, 298):                               # the gallery runs across pages 1-297
            url = 'http://www.7160.com/xingganmeinv/list_3_' + str(x) + '.html'
            r = requests.get(url=url)
            con = r.content
            tu = re.findall(r'<img src="(.+?)" alt="', con)   # grab every image URL on the page
            for n in tu:
                # urllib.urlretrieve is Python 2 only; the d:/meinv directory must already exist
                urllib.urlretrieve(n, 'd:/meinv/%s.jpg' % count)
                count = count + 1
    
    
    if __name__ == '__main__':
        getimg()
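
    urllib.urlretrieve only exists at that path in Python 2 (Python 3 moved it to urllib.request.urlretrieve). A Python 3 style sketch of the same download step using requests, keeping the d:/meinv target path from the script above:

    import os
    import requests

    def save_image(img_url, path):
        # create the target directory if needed, then write the image bytes to disk
        os.makedirs(os.path.dirname(path), exist_ok=True)
        resp = requests.get(img_url, timeout=10)
        with open(path, 'wb') as f:
            f.write(resp.content)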
    

    4. Collecting URLs from Baidu search results

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    import requests
    from bs4 import BeautifulSoup
    import sys
    import urllib3
    
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'X-Forwarded-For': '120.239.169.74'
    }
    
    def url(key):
        for i in range(0, 10, 10):                      # pn pages through Baidu results in steps of 10 (only the first page with this range)
            bd_search = "https://www.baidu.com/s?word=%s&pn=%s" % (key, str(i))
            r = requests.get(bd_search, headers=headers, verify=False, timeout=2)
            s = r.text
            soup = BeautifulSoup(s, "lxml")
            url_list = soup.select(".t > a")            # the result titles are <a> tags inside .t containers
            for url in url_list:
                real_url = url['href']                  # Baidu redirect link for this result
                try:
                    r = requests.get(real_url, headers=headers, verify=False, timeout=2)   # follow the redirect
                    print(r.url)                        # final destination URL
                    print(key)
                except Exception as e:
                    print(e)

    if __name__ == '__main__':
        command = sys.argv[1:]
        canshu = "".join(command)                       # join the command-line arguments into the search keyword
    
        url(canshu)
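
    As written, the script only prints each resolved link. A minimal sketch for collecting the results instead of printing them (the collect_urls name is illustrative, not from the original; it reuses the headers dict defined above):

    def collect_urls(key, pages=1):
        found = set()
        for i in range(0, pages * 10, 10):
            r = requests.get("https://www.baidu.com/s?word=%s&pn=%s" % (key, str(i)),
                             headers=headers, verify=False, timeout=2)
            soup = BeautifulSoup(r.text, "lxml")
            for a in soup.select(".t > a"):
                try:
                    # follow Baidu's redirect and keep the final URL
                    found.add(requests.get(a['href'], headers=headers, verify=False, timeout=2).url)
                except Exception:
                    pass
        return found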
    