zoukankan      html  css  js  c++  java
  • [sh]uniq-sort-awk

    题目:[百度搜狐面试题] 统计url出现次数

    oldboy.log(下文示例命令中该文件保存为 url.txt,内容相同)
    
    http://www.etiantain.org/index.html
    
    http://www.etiantain.org/1.html
    
    http://post.etiantain.org/index.html
    
    http://mp3.etiantain.org/3.html
    
    http://www.etiantain.org/1.html
    
    http://post.etiantain.org/2.html
    
    uniq - report or omit repeated lines

    去除相邻的重复的行

    [root@moban data]# cat ip.txt
    
    10.0.0.9
    
    10.0.0.8
    
    10.0.0.7
    
    10.0.0.7
    
    10.0.0.8
    
    10.0.0.8
    
    10.0.0.9
    
    [root@moban data]# uniq ip.txt
    
    10.0.0.9
    
    10.0.0.8
    
    10.0.0.7
    
    10.0.0.8
    
    10.0.0.9
    
    让重复的行相邻
    
    [root@moban data]# sort ip.txt
    
    10.0.0.7
    
    10.0.0.7
    
    10.0.0.8
    
    10.0.0.8
    
    10.0.0.8
    
    10.0.0.9
    
    10.0.0.9
    
    [root@moban data]# sort ip.txt |uniq
    
    10.0.0.7
    
    10.0.0.8
    
    10.0.0.9
    
    [root@moban data]# sort -u ip.txt
    
    10.0.0.7
    
    10.0.0.8
    
    10.0.0.9
    
    -u, --unique
    
    with -c, check for strict ordering; without -c, output only the
    
    first of an equal run
    
    [root@moban data]# sort ip.txt |uniq -c
    
    2 10.0.0.7
    
    3 10.0.0.8
    
    2 10.0.0.9
    
    uniq:-c 计数
    
    -c, --count
    
    prefix lines by the number of occurrences
    
    [root@moban data]# awk -F / '{print $3}' url.txt
    
    www.etiantain.org
    
    www.etiantain.org
    
    post.etiantain.org
    
    mp3.etiantain.org
    
    www.etiantain.org
    
    post.etiantain.org

    解答:

    [root@moban data]# awk -F / '{print $3}' url.txt|sort|uniq -c
    
    1 mp3.etiantain.org
    
    2 post.etiantain.org
    
    3 www.etiantain.org
    
    降序排序:
    
    法1:
    
    [root@moban data]# awk -F / '{print $3}' url.txt|sort|uniq -c|sort -rn #-n 按数值比较次数,否则次数达到两位数时按字符串比较会排错
    
    3 www.etiantain.org
    
    2 post.etiantain.org
    
    1 mp3.etiantain.org
    
    法2:cut
    
    [root@moban data]# cut -d / -f3 url.txt |sort|uniq -c|sort -rn
    
    3 www.etiantain.org
    
    2 post.etiantain.org
    
    1 mp3.etiantain.org
    
    优化(注意:这种写法是先按域名倒序排列再计数,输出只是恰好与按次数降序一致;要真正按次数降序,仍需在 uniq -c 之后再接 sort -rn):
    
    [root@moban data]# cut -d / -f3 url.txt |sort -r|uniq -c
    
    3 www.etiantain.org
    
    2 post.etiantain.org
    
    1 mp3.etiantain.org
    
    排序:
    
    sort -rn
    
    [root@lanny test]# cat ip.txt
    
    10.0.0.9 o
    
    10.0.0.9 a
    
    10.0.0.8 z
    
    10.0.0.8 k
    
    10.0.0.8 c
    
    10.0.0.7 n
    
    10.0.0.7 f
    
    对第二列排序
    
    -t 分隔符 -k 第几列
    
    [root@lanny test]# sort -t " " -k2 ip.txt
    
    10.0.0.9 a
    
    10.0.0.8 c
    
    10.0.0.7 f
    
    10.0.0.8 k
    
    10.0.0.7 n
    
    10.0.0.9 o
    
    10.0.0.8 z
    
    默认以空白字符(空格或制表符)分隔字段,因此 -t 可以省略
    
    [root@lanny test]# sort -k2 ip.txt
    
    [root@lanny test]# sort -rk2 ip.txt #倒序排列
    
    -t 用来指定字段分隔符,例如 -t. 表示按点号分隔域
    
    类似 awk 的 -F(取字段用 $1、$2)或 cut 的 -d(取字段用 -f 数字).
    
    sort -runtk
    
    -r 倒序 -u 去重 -n 按数字排序 -t 指定分隔符 -k 指定第几列
    
    uniq -c
    
    题目:要求对ip的第三列降序排序,如果第三列相同,那就第四列按照降序排序.
    
    [root@lanny test]# cat arp.txt
    
    192.168.0.3 00:e0:4c:41:d2:a5
    
    192.168.2.2 00:e0:4c:41:d1:7d
    
    192.168.3.7 00:50:bf:11:94:60
    
    192.168.3.5 00:e0:4c:43:a3:46
    
    192.168.2.4 00:0a:eb:6d:08:10
    
    192.168.1.2 00:01:6c:99:37:47
    
    192.168.4.9 00:0a:e6:b5:d1:4b
    
    192.168.0.4 00:0e:1f:51:74:24
    
    192.168.6.7 00:1d:72:40:b2:e1
    
    192.168.8.4 00:01:6c:36:5d:64
    
    192.168.1.22 00:e0:4c:41:ce:73
    
    192.168.0.15 00:e0:4c:41:d7:0e
    
    192.168.2.9 00:e0:4c:41:d1:8b
    
    192.168.0.122 00:16:ec:c5:46:45
    
    192.168.9.115 00:01:6c:98:f7:07
    
    192.168.7.111 00:17:31:b6:6e:a9
    
    sort -t. -k3.1,3.1nr -k4.1,4.3nr arp.txt
    
    -k多少列
    
    -k3.1,3.1 第三列第一个字符到第三列第一个字符(只比较第一个字符,仅适用于第三段为一位数的情况;更通用的写法是 -k3,3nr -k4,4nr)
    
    -k4.1,4.3 第四列第一个字符到第四列第三个字符
    
    [root@lanny test]# sort -t. -k3.1,3.1nr -k4.1,4.3nr arp.txt
    
    192.168.9.115 00:01:6c:98:f7:07
    
    192.168.8.4 00:01:6c:36:5d:64
    
    192.168.7.111 00:17:31:b6:6e:a9
    
    192.168.6.7 00:1d:72:40:b2:e1
    
    192.168.4.9 00:0a:e6:b5:d1:4b
    
    192.168.3.7 00:50:bf:11:94:60
    
    192.168.3.5 00:e0:4c:43:a3:46
    
    192.168.2.9 00:e0:4c:41:d1:8b
    
    192.168.2.4 00:0a:eb:6d:08:10
    
    192.168.2.2 00:e0:4c:41:d1:7d
    
    192.168.1.22 00:e0:4c:41:ce:73
    
    192.168.1.2 00:01:6c:99:37:47
    
    192.168.0.122 00:16:ec:c5:46:45
    
    192.168.0.15 00:e0:4c:41:d7:0e
    
    192.168.0.4 00:0e:1f:51:74:24
    
    192.168.0.3 00:e0:4c:41:d2:a5
    
    题目:[百度搜狐面试题] 统计url出现次数 ---awk解决
    
    oldboy.log
    
    http://www.etiantain.org/index.html
    
    http://www.etiantain.org/1.html
    
    http://post.etiantain.org/index.html
    
    http://mp3.etiantain.org/3.html
    
    http://www.etiantain.org/1.html
    
    http://post.etiantain.org/2.html
    
    数组:
    
    [root@lanny test]# awk 'BEGIN{array[1]="lanny";array[2]="oldlanny";for(key in array) print key,array[key]}'
    
    1 lanny
    
    2 oldlanny
    
    t2.awk
    
    #!/bin/awk -f
    
    BEGIN{
    
    array[1]="lanny"
    
    array[2]="oldlanny"
    
    for(key in array)
    
    print key,array[key]
    
    }
    
    解析:BEGIN 块在 awk 读取任何输入之前执行,这里用来初始化数组并直接输出
    
    [root@lanny test]# awk -f t2.awk
    
    1 lanny
    
    2 oldlanny
    
    [root@lanny test]# ./t2.awk #加了权限后可以这样执行
    
    -f 从文件读
    
    另一种方式:
    
    提供BEGIN和END的作用是给程序赋予初始状态和在程序之后执行一些扫尾的工作.
    
    任何在BEGIN之后列出的操作(在{}内)将在awk开始扫描输入之前执行,而END之后列出的操作将在扫描完全部的输入之后执行.因此,通常使用BEGIN来显示变量和预置(初始化)变量,使用END来输出最终结果.
    
    将数组输出
    
    [root@lanny test]# awk 'BEGIN{array[1]="lanny";array[2]="oldlanny";}END{for (key in array) print key,array[key]}' /etc/hosts #没什么实在意义,只不过写法需要数据流, begin 初始化,end 处理.
    
    1 lanny
    
    2 oldlanny
    
    [root@lanny test]#cat /etc/hosts | awk 'BEGIN{array[1]="lanny";array[2]="oldlanny";}END{for (key in array) print key,array[key]}'
    
    将数组内容输出到文件
    
    [root@lanny test]# awk 'BEGIN{array[1]="lanny";array[2]="oldlanny";}END{for (key in array) print key,array[key]}' /etc/hosts > awk.log
    
    [root@lanny test]# cat awk.log
    
    1 lanny
    
    2 oldlanny
    
    把第一列作为下标、第二列作为值存入数组 S[],读完文件后在 END 中遍历输出
    
    [root@lanny test]# awk '{S[$1]=$2}END{for(k in S) print k,S[k]}' awk.log
    
    1 lanny
    
    2 oldlanny
  • 相关阅读:
    hdu 5726 GCD
    codeforces 982C Cut 'em all!
    codeforces 982B Bus of Characters
    codeforces 982A Row
    codeforces 983B XOR-pyramid
    codeforces 979D Kuro and GCD and XOR and SUM
    codeforces 983A Finite or not?
    codeforces 984B Minesweeper
    codeforces 979C Kuro and Walking Route
    codeforces 979B Treasure Hunt
  • 原文地址:https://www.cnblogs.com/iiiiher/p/5330458.html
Copyright © 2011-2022 走看看