zoukankan      html  css  js  c++  java
  • Python文本和字符串常用操作

    ## 字符串分割

     1 line = "This is my love!"
     2 fields = line.split(' ')
     3 print(fields)
     4 # ['This', 'is', 'my', 'love!']
     5 
     6 # 多条件分割
     7 import re
     8 
     9 
    10 line = "asd dfwerf, sdfs; jtyy. werwe, sdfsd"
    11 fields = re.split(r"(,|\.|;|\s)\s*", line)
    12 print(fields)
    13 # ['asd', ' ', 'dfwerf', ',', 'sdfs', ';', 'jtyy', '.', 'werwe', ',', 'sdfsd']
    14 
    15 # 如果不需要分隔符可以使用(?:...)不捕获分组
    16 fields = re.split(r"(?:,|\.|;|\s)\s*", line)
    17 print(fields)
    18 # ['asd', 'dfwerf', 'sdfs', 'jtyy', 'werwe', 'sdfsd']

    ## 开头或结尾匹配startswith(), endswith()

     1 url = 'https://www.baidu.com'
     2 print(url.startswith('https'))
     3 # True
     4 
     5 words = ['sd', 'asad', 'fgwer', 'jtdwse', 'qwieu', 'pqwej', 'ejwqi', 'iaweq']
     6 w = [word for word in words if word.startswith(('a', 'e', 'i'))]
     7 # startswith(),endswith()方法接收一个元组表示多项匹配,元组的括号不可以省略
     8 print(w)
     9 # ['asad', 'ejwqi', 'iaweq']
    10 
    11 # 判断某个目录下是否有某些类型的文件
    12 if any(name.endswith(('.c', '.h')) for name in listdir(dirname)):
    13     pass

    ## Shell通配符(*?[0-9])匹配

     1 from fnmatch import fnmatch, fnmatchcase
     2 
     3 
     4 print(fnmatch('hello.py', '*.py'))
     5 # True
     6 print(fnmatch('hello.py', '?ello.py'))
     7 # True
     8 print(fnmatch('hello-1.py', 'hello-[0-9].py'))
     9 # True
    10 print(fnmatch('hello.txt', '*.TXT'))
    11 # fnmatch()在不同操作系统中结果不同
    12 # 若在类UNIX系统中为False(即大小写敏感),在Windows系统中为True(即大小写不敏感)
    13 
    14 print(fnmatchcase('hello.txt', '*.TXT'))
    15 # False
    16 # fnmatchcase()在任何操作系统中都大小写敏感

    ## 字符串匹配和搜索

     1 import re
     2 
     3 
     4 date = '7/24/2018'
     5 # 匹配
     6 print(bool(re.match(r'\d+/\d+/\d+', date)))
     7 # True
     8 
     9 # 同一模式多次使用可以预先编译
    10 date_pat = re.compile(r'\d+/\d+/\d+')
    11 print(bool(date_pat.match(date)))
    12 # True
    13 
    14 # 查找并捕获
    15 date_pat = re.compile(r'(\d+)/(\d+)/(\d+)')
    16 line = "Today is 7/24/2018, tomorrow is 7/25/2018"
    17 print(date_pat.findall(line))
    18 # [('7', '24', '2018'), ('7', '25', '2018')]
    19 
    20 m = date_pat.match(date)
    21 print(m)
    22 print(m.group())
    23 # 7/24/2018
    24 print(m.group(1))
    25 # 7
    26 print(m.group(2))
    27 # 24
    28 print(m.group(3))
    29 # 2018
    30 print(m.group(0))
    31 # 7/24/2018

    ## 字符串的修改和替换

     1 line = "Yes, it's me!"
     2 print(line.replace("Yes", "Yeah"))
     3 # Yeah, it's me!
     4 
     5 import re
     6 
     7 
     8 text = "Today is 7/24/2018, tomorrow is 7/25/2018"
     9 date_pat = re.compile(r'(\d+)/(\d+)/(\d+)')
    10 print(date_pat.sub(r'\3-\1-\2', text))   # 其中r'\3-\1-\2'指的是通过正则捕获的数据的索引,和上面group()方法的对应关系相同
    11 # Today is 2018-7-24, tomorrow is 2018-7-25
    12 
    13 
    14 # sub()方法还可以接受一个函数作为参数以应对更加复杂的替换
    15 from calendar import month_abbr
    16 
    17 
    18 def change_date(m):  # 参数为match对象,即match()或者search()方法的返回值
    19     mon_name = month_abbr[int(m.group(1))]
    20     return "{} {} {}".format(m.group(2), mon_name, m.group(3))
    21 
    22 print(date_pat.sub(change_date, text))
    23 # Today is 24 Jul 2018, tomorrow is 25 Jul 2018
    24 
    25 # 查看subn()替换的数量
    26 print(date_pat.subn(change_date, text)) # 返回一个元组,第二个值为替换的次数
    27 # ('Today is 24 Jul 2018, tomorrow is 25 Jul 2018', 2)

    ## 搜索忽略大小写

     1 import re
     2 
     3 
     4 text = "UPPER PYTHON, lower python, Mixed Python"
     5 print(re.findall('python', text, flags=re.IGNORECASE))
     6 # ['PYTHON', 'python', 'Python']
     7 print(re.sub('python', 'java', text, flags=re.IGNORECASE))  # 替换并不会按照原有规则,而是所有匹配项都统一替换
     8 # UPPER java, lower java, Mixed java
     9 
    10 # 可以使用辅助函数弥补
    11 def matchcase(word):
    12     def replace(m): # 参数为一个match对象
    13         text = m.group()
    14         if text.isupper():
    15             return word.upper()
    16         if text.islower():
    17             return word.lower()
    18         if text[0].isupper(): # 首字母大写
    19             return word.title()
    20         return word
    21     return replace
    22 
    23 print(re.sub('python', matchcase('java'), text, flags=re.IGNORECASE))
    24 # UPPER JAVA, lower java, Mixed Java

    ## 贪婪匹配和最短匹配

    1 import re
    2 
    3 
    4 text = 'You said "Yes", I said "No"'
    5 print(re.findall(r'".*"', text))  # 贪婪匹配,匹配结果尽可能长
    6 # ['"Yes", I said "No"']
    7 
    8 print(re.findall(r'".*?"', text)) # 最短匹配, 匹配结果尽可能短
    9 # ['"Yes"', '"No"']

    ## 多行匹配

     1 import re
     2 
     3 
     4 comment = re.compile(r'/\*(.*?)\*/')    # 此模式无法匹配多行, 因为 . 号无法匹配换行符
     5 text1 = "/* this is a comment */"
     6 text2 = """
     7     /* this is a
     8     multiline comment */
     9 """
    10 print(comment.findall(text1))
    11 # [' this is a comment ']
    12 
    13 print(comment.findall(text2))
    14 # []
    15 
    16 comment = re.compile(r'/\*((?:.|\n)*?)\*/') # 在模式中加上换行符,可以匹配多行
    17 print(comment.findall(text1))
    18 
    19 
    20 print(comment.findall(text2))
    21 # [' this is a\n    multiline comment ']
    22 
    23 comment = re.compile(r'/\*(.*?)\*/', flags=re.DOTALL) # flags=re.DOTALL使 . 号可以匹配所有字符
    24 print(comment.findall(text1))
    25 # [' this is a comment ']
    26 print(comment.findall(text2))
    27 # [' this is a\n    multiline comment ']

    ## 删除多余字符

     1 text = "---Hello   World+++"
     2 print(text.strip("-+")) # 不传参数时默认清除两侧的空白字符(空格、制表符、换行等); 只能清除两侧的多余内容,字符串中间的内容无法清除
     3 # Hello   World
     4 print(text.lstrip("-")) # 清除左侧多余字符
     5 # Hello   World+++
     6 print(text.rstrip("+")) # 清除右侧多余字符
     7 # ---Hello   World
     8 
     9 text = "Hello  World"
    10 print(text.replace('  ', ' ')) # 只能清除/替换固定个数的字符
    11 # Hello World
    12 
    13 import re
    14 
    15 
    16 text = "Hello     World"
    17 print(re.sub(r'\s+', ' ', text))  # 可以清除/替换不定个数的字符
    18 # Hello World

    ## 字符串对齐

     1 text = "Hello World"
     2 print(text.ljust(20, '-')) # 第一个参数为总字符数, 第二个参数为填充字符,默认为空格
     3 # Hello World---------
     4 print(text.rjust(20))
     5 #          Hello World
     6 print(text.center(20, '-'))
     7 # ----Hello World-----
     8 
     9 print(format(text, '>20'))
    10 #          Hello World
    11 
    12 print(format(text, '-<20'))
    13 # Hello World---------
    14 
    15 print(format(text, '+^20'))
    16 # ++++Hello World+++++
    17 
    18 print(format(3.14159265, '-^10.4f'))  # 格式化数字
    19 # --3.1416--

    ## 字符串中插入变量

     1 text = 'Hello, {name}!'
     2 print(text.format(name="Stanley"))
     3 # Hello, Stanley!
     4 
     5 text = '{name} has {n} message(s).'
     6 name = "Stanley"
     7 n = 32
     8 print(text.format_map(vars())) # 从全局变量中查找相应数据
     9 # Stanley has 32 message(s).
    10 
    11 class UserInfo:
    12     def __init__(self, name, n):
    13         self.name = name
    14         self.n = n
    15 
    16 a = UserInfo('Stanley', 30)
    17 print(text.format_map(vars(a))) # 从实例属性中查找数据
    18 # Stanley has 30 message(s).

      

        - 定义一个类包装输入,避免变量找不到的问题

    1 class safesub(dict):
    2     def __missing__(self, key): # 重写__missing__()方法
    3         return "{" + key + "}"
    4 
    5 text = '{name} has {n} message(s).'
    6 name = "Stanley"
    7 print(text.format_map(safesub(vars())))
    8 # Stanley has {n} message(s).

    ## 文本换行

     1 import textwrap
     2 
     3 text = "Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace. It provides constructs that enable clear programming on both small and large scales. In July 2018, the creator Guido Rossum stepped down as the leader in the language community after 30 years."
     4 print(textwrap.fill(text, 40))
     5 """
     6 Python is an interpreted high-level
     7 programming language for general-purpose
     8 programming. Created by Guido van Rossum
     9 and first released in 1991, Python has a
    10 design philosophy that emphasizes code
    11 readability, notably using significant
    12 whitespace. It provides constructs that
    13 enable clear programming on both small
    14 and large scales. In July 2018, the
    15 creator Guido Rossum stepped down as the
    16 leader in the language community after
    17 30 years.
    18 """
    19 print(textwrap.fill(text, 80, initial_indent="      "))  # 首行缩进
    20 """
    21       Python is an interpreted high-level programming language for general-
    22 purpose programming. Created by Guido van Rossum and first released in 1991,
    23 Python has a design philosophy that emphasizes code readability, notably using
    24 significant whitespace. It provides constructs that enable clear programming on
    25 both small and large scales. In July 2018, the creator Guido Rossum stepped down
    26 as the leader in the language community after 30 years.
    27 """
    28 print(textwrap.fill(text, 80, subsequent_indent="      "))  # 悬挂缩进
    29 """
    30 Python is an interpreted high-level programming language for general-purpose
    31       programming. Created by Guido van Rossum and first released in 1991,
    32       Python has a design philosophy that emphasizes code readability, notably
    33       using significant whitespace. It provides constructs that enable clear
    34       programming on both small and large scales. In July 2018, the creator
    35       Guido Rossum stepped down as the leader in the language community after 30
    36       years.
    37 """

    参考资料:
      Python Cookbook, 3rd edition, by David Beazley and Brian K. Jones (O’Reilly).

  • 相关阅读:
    luogu 1593
    luogu 1369
    hdu 1796
    bzoj 3398
    luogu 4587
    luogu 2152
    bzoj 3629
    bzoj 1507: [NOI2003]Editor
    bzoj 1503: [NOI2004]郁闷的出纳员
    bzoj 1497: [NOI2006]最大获利
  • 原文地址:https://www.cnblogs.com/hycstar/p/9361901.html
Copyright © 2011-2022 走看看