zoukankan      html  css  js  c++  java
  • # Pyspark全角半角符号数据格式化转换UDF函数

    Pyspark全角半角符号数据格式化转换UDF

    import findspark
    
    findspark.init()
    import pyspark
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import *
    from pyspark.sql.types import IntegerType, FloatType, StringType
    
    # Create the SparkContext with default settings; it is used below to
    # parallelize the sample rows into an RDD.
    sc = SparkContext()
    # SQLContext bound to that SparkContext; used below to register the UDF
    # for SQL and to run the SQL query.
    # NOTE(review): SQLContext is deprecated in Spark 3.x in favour of
    # SparkSession -- confirm the target Spark version before migrating.
    sqlContext = SQLContext(sc)
    
    
    # 全角转成半角
    def full2half(s):
        """Convert full-width characters in *s* to their half-width forms.

        The ideographic (full-width) space U+3000 becomes an ASCII space, and
        every code point in U+FF01..U+FF5E is shifted down by 0xFEE0 onto its
        ASCII counterpart U+0021..U+007E. All other characters are copied
        unchanged.

        Args:
            s: The string to convert, or ``None``.

        Returns:
            The converted string, or ``None`` when *s* is ``None``.
        """
        # Spark passes None to a Python UDF for NULL column values; iterating
        # over None would raise TypeError and fail the job, so keep NULL as NULL.
        if s is None:
            return None

        converted = []
        for char in s:
            code = ord(char)
            if code == 0x3000:  # full-width space -> ASCII space
                code = 0x20
            elif 0xFF01 <= code <= 0xFF5E:  # other full-width forms -> ASCII
                code -= 0xFEE0
            converted.append(chr(code))
        # Join once instead of growing a string with += on every character.
        return ''.join(converted)
    
    
    # Sample (code, org) rows used to demonstrate the conversion.
    sample_rows = [
        ['DBD', '迪布达'],
        ['GBD', '迪布达(中国)'],
        ['GBD', '迪布达(中国)'],
        ['GBD', '迪布达(中国)'],
    ]
    df = sc.parallelize(sample_rows).toDF(['code', 'org'])

    # Wrap full2half as a string-returning column UDF and overwrite `org`
    # with its converted value.
    full_half = udf(full2half, StringType())
    df = df.withColumn('org', full_half(df['org']))
    df.show()
    # Expected output (the replaced column keeps the name `org`):
    # +----+------------+
    # |code|         org|
    # +----+------------+
    # | DBD|      迪布达|
    # | GBD|迪布达(中国)|
    # | GBD|迪布达(中国)|
    # | GBD|迪布达(中国)|
    # +----+------------+

    # Expose the same UDF to Spark SQL under the name `fullToHalf`, then call
    # it from a query against a temporary view of the DataFrame.
    sqlContext.udf.register(name="fullToHalf", f=full_half)
    df.createOrReplaceTempView("t1")
    query = """
            SELECT code,fullToHalf(org) FROM T1
        """
    sqlContext.sql(query).show()
    # Expected output:
    # +----+---------------+
    # |code|fullToHalf(org)|
    # +----+---------------+
    # | DBD|         迪布达|
    # | GBD|   迪布达(中国)|
    # | GBD|   迪布达(中国)|
    # | GBD|   迪布达(中国)|
    # +----+---------------+
    
  • 相关阅读:
    uva10256
    uva11168
    zoj2318
    hdu6121
    hdu6127
    bzoj3957: [WF2011]To Add or to Multiply
    bzoj4377: [POI2015]Kurs szybkiego czytania
    bzoj3137: [Baltic2013]tracks
    bzoj4069: [Apio2015]巴厘岛的雕塑
    bzoj4169: Lmc的游戏
  • 原文地址:https://www.cnblogs.com/pandaa/p/15102048.html
Copyright © 2011-2022 走看看