zoukankan      html  css  js  c++  java
  • base大家族详解

    base大家族详解

    base大家族详解

    1 简介

    对于base编码的使用,参考Base What? A Practical Introduction to Base Encoding

    • base2: 2进制
    • base10: 10进制
    • base16: 计算机字节表示
    • base32: 字母加数字
    • base64: 更大的表示范围
    • base58: bitcoin为了比base64更易读采用的表示

    这些base编码主要是在计算机的二进制表示,和人可读的字符表示之间转换。

    具体实现可以分为2种,能利用bit位直接转换的base(例如base2, base16,base32, base64),和不能利用bit位直接转换的base(例如base10, base58)。

    2 实现代码

    全部采用clojure实现,主要是为了理解base编码,只求代码容易理解,性能不是很好:

    (ns base)
    
    (defn indices-of
      [f coll]
      (keep-indexed #(if (f %2) %1 nil) coll))
    
    (defn first-index-of
      [f coll]
      (first (indices-of f coll)))
    
    (defn find-thing
      "查找`coll`中第一个等于value的索引"
      [value coll]
      (first-index-of #(= % value) coll))
    
    (def divmod "返回[商 余数]" (juxt quot mod))
    
    ;; 1. 不能利用bit位直接转换的base
    ;; 基本编码方法就是把输入作为一个数字,不断的对baseN的N进行除法运算,
    ;; 每次的余数就是base-table的索引,商作为下一次的除数进行计算。直到除尽
    
    (defn base-encode-num
      "base编码一个数字`n`
      `leading-zeros`为padding个数
      `base-table`为编码转换表"
      [n leading-zeros base-table]
      (let [base-count (count base-table)]
        (loop [n n
               result ""]
          (if (>= n base-count)
            (let [[d m] (divmod n base-count)]
              (recur d
                    ;; 编码是从后往前进行的,每次除以baseN的N,
                    ;; 以余数作为索引
                    (-> (nth base-table m)
                        (str result))))
            (->> (if (pos? n)
                   (-> (nth base-table n)
                       (str result))
                   result)
                 ;; 注意,因为是从后往前进行生成的,padding在前面添加
                 (concat (repeat leading-zeros (first base-table)))
                 (apply str))))))
    
    (defn base-enc
      "编码base10,base36,base58等"
      [base-table s]
      (let [s-bytes (.getBytes s)
            leading-zeros (->> s-bytes
                               (take-while zero?)
                               count)]
        (-> (java.math.BigInteger. 1 s-bytes)
            (base-encode-num leading-zeros base-table))))
    
    (defn invert-table
      "生成`table`序列的反向索引表 "
      [table]
      (into {}
            (map #(vector %1 %2)
                 table
                 (iterate inc 0))))
    
    ;; baseN的解码方法就是对输入字符串的每个字符查找索引 * (N ** 输入串反向顺序的索引),
    ;; 结果累加就是目标数字,再把数字还原为字符串
    
    (defn base-dec
      "解码base10,base36,base58等"
      [base-table s]
      (let [padding (->> s
                         (take-while #(= % (first base-table)))
                         (map (constantly (byte 0))))
            inverted-table (invert-table base-table)
            base-count (count base-table)]
        (->> (reverse s)
             (reduce (fn [[r m] c]
                       [(+ r (*' m (inverted-table c)))
                        (*' m base-count)]) [0 1])
             first
             str
             java.math.BigInteger.
             .toByteArray
             (drop-while zero?)
             (concat padding)
             byte-array
             String.)))
    
    (def base58-table "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
    (def enc-base58 (partial base-enc base58-table))
    (def dec-base58 (partial base-dec base58-table))
    
    (def base36-table "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    (def enc-base36 (partial base-enc base36-table))
    (def dec-base36 (partial base-dec base36-table))
    
    (def base10-table "0123456789")
    (def enc-base10 (partial base-enc base10-table))
    (def dec-base10 (partial base-dec base10-table))
    
    (comment
      (def target "D9cS9N9iHjMLTdA8YSMRMp")
    
      (def r (dec-base58 target))
    
      (= target (enc-base58 r))
    
      (= "64021609848" (enc-base36 "testabc"))
    
      (enc-base10 "this is a test")
      ;; => "2361031878030638688519054699098996"
    
      (dec-base10 "2361031878030638688519054699098996")
      ;; => "this is a test"
    
      )
    
    ;;; ============= 以2为基的base算法,base2, base4, base16, base32, base64 ...
    ;;; 可以利用bit位直接变换的base
    ;;; 算法把输入的8bit串转换为base(2^n)的nbit串,不需要做除法,
    ;;; 编码过程 直接把输入字符串转换为bit序列,再根据baseN的位宽(如base64 6位,base32 5位)重新划分,
    ;;;  查表获取目标串,不需要除法,因此要比上面的base算法速度快
    ;;; 下面统一称为base2
    
    (defn bits
      "转换数字为bit序列,如果不指定`width`,默认为8位"
      ([n] (bits n 8))
      ([n width]
            (reverse (map #(bit-and (bit-shift-right n %) 1) (range width)))))
    
    (defn numb
      "bit序列转换为数字"
      [bits]
      (BigInteger. (apply str bits) 2))
    
    (defn string-to-bits
      "字符串转换为bit序列"
      [msg]
      (->> (.getBytes msg)
           (map #(bits %))
           flatten))
    
    (defn padding-count
      "获取字节序列`v`的padding个数"
      [v]
      (->> (vec v)
           rseq
           (take-while zero?)
           count))
    
    (defn drop-padding
      "去掉字节序列`v`的padding"
      [v]
      (-> (padding-count v)
          (drop-last v)))
    
    (defn log2
      [n]
      (/ (Math/log n)
         (Math/log 2)))
    
    (defn get-bit-width
      "获取`base-table`的位宽,即用多少字节表示一个字符"
      [base-table]
      (let [width (-> (count base-table)
                      log2)]
        (when (= width (Math/floor width))
          (int width))))
    
    (defn valid-base2-table?
      "检测是否为有效的base2表"
      [base-table]
      (-> (get-bit-width base-table)
          nil?
          not))
    
    (defn byte-align
      "根据位宽bit-width,获取需要对齐的base字节数
      比如一个字节是8位,转换base64,每个base64字节为6位
      byte-align就是确定最低需要多少个base64的6位字节才能对齐到8位字节"
      [bit-width]
      (let [max-padding 12
            fn-len-range (fn [width]
                           (map (partial * width) (range 1 max-padding)))
            hex-range (fn-len-range 8)
            bit-range (fn-len-range bit-width)
            align-value (->> (clojure.set/intersection (set hex-range) (set bit-range))
                             (apply min))]
        (/ align-value bit-width)))
    
    (defn hex-byte-align
      "获取最低需要多少个hex字节才能对齐"
      [bit-width]
      (-> (byte-align bit-width)
          (* bit-width)
          (/ 8)))
    
    (comment
    
      (byte-align 6)
      ;; => 4
    
      (hex-byte-align 6)
      ;; => 3
      ;; base64需要4个6位字节才能对齐到3个8位hex字节,即 4 * 6 = 3 * 8
    
      (byte-align 5)
      ;; => 8
    
      (hex-byte-align 5)
      ;; => 5
      ;; base32需要8个5位字节才能对齐到5个8位hex字节, 即 8 * 5 = 5 * 8
    
      (hex-byte-align 4)
      ;; => 1
      (hex-byte-align 2)
      ;; => 1
      (hex-byte-align 1)
      ;; => 1
      ;; 可以看到base2, base4,base16 可以对齐到1个字节,不需要padding
    
      )
    
    (defn gen-padding
      "生成`data`的padding串"
      [bit-width data]
      (let [len (count data)
            align (byte-align bit-width)
            aligned-byte (mod len align)]
        (if (zero? aligned-byte)
          ""
          (apply str
                 (-> (- align aligned-byte)
                     (repeat "="))))))
    
    (defn base2-enc
      "编码base2类型的字符串`s`
      `base-table`必须是以2为基的base表,base(2^n),例如base16,base32,base64"
      [base-table s]
      {:pre [(valid-base2-table? base-table)]}
      (let [bits (string-to-bits s)
            base2-char #(->> (numb %)
                             (nth base-table))
            bit-width (get-bit-width base-table)
            data (partition bit-width bit-width (repeat 0) bits)]
        (str (->> (map base2-char data)
                  (apply str))
             (gen-padding bit-width data))))
    
    (defn base2-dec
      "base2解码,`s`为要解码的字符串
      `base-table` 解码表"
      [base-table s]
      {:pre [(valid-base2-table? base-table)]}
      (let [bit-width (get-bit-width base-table)
            inverted-table (merge (invert-table base-table)
                                  {= 0} ;; 添加padding字符
                                  )]
        (->> (mapcat #(some-> (inverted-table %1)
                              (bits bit-width))
                     s)
             ;; mapcat自动过滤掉base2-bits的nil值,即忽略s串中不属于base-table的字符
             (partition 8 8 (repeat 0))
             (map numb)
             drop-padding
             (map char)
             (apply str))))
    
    ;; base64,base32 需要使用base2-enc dec
    (def base64-table "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
    (def enc-base64 (partial base2-enc base64-table))
    (def dec-base64 (partial base2-dec base64-table))
    
    (def base32-table "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
    (def enc-base32 (partial base2-enc base32-table))
    (def dec-base32 (partial base2-dec base32-table))
    
    ;; base16等同于hex转换
    (def base16-table "0123456789ABCDEF")
    (def enc-base16 (partial base2-enc base16-table))
    (def dec-base16 (partial base2-dec base16-table))
    
    ;; base2是2进制表示
    (def base2-table "01")
    (def enc-base2 (partial base2-enc base2-table))
    (def dec-base2 (partial base2-dec base2-table))
    
    ;; 自动定义base的宏
    (defmacro defbase
      "根据base-table生成base编码和解码函数的定义"
      [base-name base-table]
      (let [table-name (symbol (str base-name "-table"))
            enc-fname (symbol (str "enc-" base-name))
            dec-fname (symbol (str "dec-" base-name))
            def-table `(def ~table-name ~base-table)]
        (if-let [bit-width (get-bit-width base-table)]
          `(do ~def-table
               (def ~enc-fname (partial base2-enc ~base-table))
               (def ~dec-fname (partial base2-dec ~base-table)))
          `(do ~def-table
               (def ~enc-fname (partial base-enc ~base-table))
               (def ~dec-fname (partial base-dec ~base-table))))))
    
    ;; (defbase base64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
    ;; (defbase base32 "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
    ;; (defbase base16 "0123456789ABCDEF")
    ;; (defbase base2 "01")
    ;; (defbase base58 "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
    ;; (defbase base36 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    ;; (defbase base10 "0123456789")
    
    (defbase base8 "01234567")
    
    (comment
    
      base8-table
      ;; => "01234567"
    
      (enc-base8 "test 1")
      ;; => "3506256335020061"
    
      (dec-base8 *1)
      ;; => "test 1"
    
      (= 2 (padding-count [1 2 0 3 0 0]))
    
      (= 0 (padding-count [1 2 3]))
    
      (= 0 (padding-count []))
    
      (= [1 2 0 3] (drop-padding [1 2 0 3 0 0]))
    
      (= "I" (base2-dec base64-table "SQ=="))
    
      (= "AM" (base2-dec base64-table "QU0="))
    
      (base2-enc base64-table "I")
      ;; => "SQ=="
    
      (base2-enc base64-table "AM")
      ;; => "QU0="
    
      (String. (base64/encode "I"))
      ;; => "SQ=="
    
      (String. (base64/encode "AM"))
      ;; => "QU0="
    
      (= "this is a test" (base2-dec base32-table "ORUGS4ZANFZSAYJAORSXG5A="))
    
      (base32/encode "this is a test")
      ;; => "ORUGS4ZANFZSAYJAORSXG5A="
    
      (base2-enc base32-table "this is a test")
      ;; => "ORUGS4ZANFZSAYJAORSXG5A="
    
      ;; base-table不符,产生异常
      (= "base58_is_boring" (base2-dec base58-table "D9cS9N9iHjMLTdA8YSMRMp"))
    
      (= 6 (get-bit-width base64-table))
    
      (= 5 (get-bit-width base32-table))
    
      (= nil (get-bit-width base58-table))
    
      (enc-base32 "I") ;; base32的padding比较多
      ;; => "JE======"
    
      (dec-base32 *1)
      ;; => "I"
    
      ;; base16相当于内存hex表示
      (enc-base16 "this is a test")
      ;; => "7468697320697320612074657374"
    
      (dec-base16 "7468697320697320612074657374")
      ;; => "this is a test"
    
      (dec-base16 "7468IIOO697320697320612074657374") ;; 不合法字符直接忽略
      ;; => "this is a test"
    
      (codecs/bytes->hex (codecs/str->bytes "this is a test"))
      ;; => "7468697320697320612074657374"
    
      ;; base2相当于内存的二进制表示
      (enc-base2 "t")
      ;; => "01110100"
    
      ;; 	的二进制
      (Integer/toString (int 	) 2)
      ;; => "1110100"
    
      (dec-base2 "01110100")
      ;; => "t"
      ;; 注意解码时前面的0是必要的
    
      (def s1 "this is a test")
      (= (enc-base64 s1) (String. (base64/encode s1)))
    
      (= (enc-base32 s1) (base32/encode s1))
    
      )
    
    ;;; ============== 自动base解码
    
    (defn valid-base-str?
      "检查字符串s是否符合base-table"
      [base-table s]
      (clojure.set/subset? (set s)
                           (-> (set base-table)
                               (conj =))))
    
    (defn guess-base
      [text]
      (cond
        ;; 这里必须按从小到大的顺序测试
        (valid-base-str? base16-table text) :base16
        (valid-base-str? base32-table text) :base32
        (valid-base-str? base64-table text) :base64
        :else :unknown))
    
    (defn decode
      "自动base解码"
      [text & {:keys [step debug]
               :or {step 1
                    debug false}}]
      (loop [text text
             step step]
        (when debug
          (println "step" step "decode:" text))
        (case (guess-base text)
          :base16 (do (println "step" step "--> base16")
                      (-> (dec-base16 text)
                          (recur (inc step))))
          :base32 (do (println "step" step "--> base32")
                      (-> (dec-base32 text)
                          (recur (inc step))))
          :base64 (do (println "step" step "--> base64")
                      (-> (dec-base64 text)
                          (recur (inc step))))
          :unknown (do (println "result:" text)
                       text))))
    
    
    (comment
      (def text "3441353234343435353735323442353634423539354134353336353333323536343735323444343535333537344234433441343634353535353535323533343734413335344134363439353534423530344135413437353634463444353334463441344534443435333235363533353534423531354134363431353334413335")
    
      ;; 经测试,直接转换bits对于长字符串速度比较慢
      (decode text)
      ;; step 1 --> base16
      ;; step 2 --> base16
      ;; step 3 --> base32
      ;; step 4 --> base32
      ;; step 5 --> base64
      ;; step 6 --> base64
      ;; result: key:hi!venus
      ;; => "key:hi!venus"
    
      )
    

    3 结论

    base编码基本是码表转换,产生人可读的字符表示,不适合加密。

    作者: ntestoc

    Created: 2019-04-11 四 10:28

  • 相关阅读:
    Hihocoder #1014 : Trie树
    Codevs 1298 凸包周长
    洛谷 P1355 神秘大三角(计算几何基础)
    Cogs 1688. [ZJOI2008]树的统计Count(树链剖分+线段树||LCT)
    Poj 2887 Big String(块状数组)
    P1449 后缀表达式
    P3478 [POI2008]STA-Station
    P1533 可怜的狗狗
    P2073 送花
    P1014 Cantor表
  • 原文地址:https://www.cnblogs.com/ntestoc/p/10673669.html
Copyright © 2011-2022 走看看