zoukankan      html  css  js  c++  java
  • python 实用小技巧

    1. 列表

    #以下三式等价
    c = (a>b and a or b)   
    c = a if a>b else b
    c = [b, a][a>b]
    

    字符串拼接

    ' + '.join('%s,%s'%(a,b) for a,b in zip(list('abc'), range(3)))
    

    'a,0 + b,1 + c,2'

    字符查找

    'aa' in '大家aa'           #True
    '大家aa'.find('大家')    #0
    

    for循环

    from itertools import product
    result = product([1,2,3], ['a','b'])
    #[(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b'), (3, 'a'), (3, 'b')]
    

    数组重构,np.newaxis

    np.arange(5)[:,None]
    

    array([[0],
    [1],
    [2],
    [3],
    [4]])

    数组拼接, np.r_[], np.c_[],注意是方括号

    np.r_[np.array([1,2,3]), np.array([1,2,3])]
    #array([1, 2, 3, 1, 2, 3])
    
    np.c_[np.array([1,2,3]), np.array([1,2,3])]
    #array([[1, 1],
    #       [2, 2],
    #       [3, 3]])
    
    np.r_[np.array([[1,2,3],[1,2,3]]), np.array([[4,5,6],[4,5,6]])]
    #array([[1, 2, 3],
    #       [1, 2, 3],
    #       [4, 5, 6],
    #       [4, 5, 6]])
    
    np.c_[np.array([[1,2,3],[1,2,3]]), np.array([[4,5,6],[4,5,6]])]
    #array([[1, 2, 3, 4, 5, 6],
    #       [1, 2, 3, 4, 5, 6]])
    

       

    2. 嵌套列表展开,嵌套方法可实现多层列表展开

    a=[[1,2],[3,4],[5,6]]
    print([j for i in a for j in i])  #列表生成式
    
    sum(a, [])  #求和函数
    
    from functools import reduce
    reduce(lambda x,y: x+ y, a)  #工具函数
    
    from itertools import chain
    print(list(chain(*a))) 
    
    import itertools
    print(list(itertools.chain.from_iterable(a)))
    
    t=[]
    [t.extend(i) for i in a]
    print(t)
    
    #递归的方法比较容易理解
    #在stackoverflow看到大牛的列表生成式版本
    func = lambda x: [y for l in x for y in func(l)] if type(x) is list else [x]
    
    #递归自定义
    def expand_list(nested_list):
        for item in nested_list:
            if isinstance(item, (list, tuple)):
                for sub_item in expand_list(item):
                    yield sub_item
            else:
                yield item
    

      

    3. 工具函数

    # map 把函数 f 依次作用在 list 的每个元素上,得到一个 iterator 并返回。
    print(list(map(format_name, ['adam', 'LISA', 'barT'])))
    
    # reduce()传入的函数 f 必须接收两个参数,reduce()对list的每个元素反复调用函数f,并返回最终结果值。reduce()还可以接收第3个可选参数,作为计算的初始值。
    from functools import reduce
    print(reduce(f, [1, 3, 5, 7, 9], 100))
    
    # filter()根据判断结果自动过滤掉不符合条件的元素,返回由符合条件元素组成的iterator。
    print(list(filter(is_odd, [1, 4, 6, 7, 9, 12, 17])))
    
    #zip()参数可以接受任何类型的序列,同时也可以有两个以上的参数;当传入参数的长度不同时,zip能自动以最短序列长度为准进行截取,获得元组
    [i for i in zip(['a','b'], (3,4,5))]
    

      

    4. 时间

    python 常用 time, datetime处理

    Python处理时间 time  &&  datetime 模块

    import time 
    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    

      

    5. 随机数

    import numpy as np
    np.random.rand(10)
    np.random.randn(10)
    np.random.randint(10, size=10)
    np.random.shuffle(x)    #类似洗牌,打乱顺序
    np.random.permutation(citys.index)   #返回一个随机排列
    
    import random
    random.random()   #只产生一个
    random.randint(1, 10)
    random.uniform()
    random.randrange()   #[random.randrange(0,9) for i in range(30)]
    random.shuffle(a)   #a=[1,2,3], 直接对a的值修改
    

      

    6. dataframe迭代(iteritems, iterrows, itertuples, apply

    import pandas as pd
    df = pd.DataFrame([{'c1':10, 'c2':100}, {'c1':11,'c2':110}, {'c1':12,'c2':120}])
    
    [(i,j) for i,j in df.T.iteritems()]
    
    [(i,j) for i,j in df.iterrows()]
    [(i, j['c1'], j['c2']) for i,j in df.iterrows()]
    
    [(getattr(row, "c1"), getattr(row, "c2")) for row in df.itertuples(index=True, name='Pandas')]
    
    def valuation_formula(x, y):
        return x * y * 0.5
    df['price'] = df.apply(lambda row: valuation_formula(row['c1'], row['c2']), axis=1)
    

      

    7. dataframe修改索引

    a.rename(columns={'A':'a', 'C':'c'}, inplace = True)   #dataframe修改列名
    a.index = ['1', '2', np.nan]     # dataframe修改索引
    

      

    8. 读写文件

      读文件

    import pandas as pd
    import os
    
    excel_data = pd.read_excel("./xx/xx.xlsx")
    csv_data = pd.read_csv("./xx/xx.csv")
    txt_data = pd.read_table("./xx/xx.txt")
    json_data = pd.read_json("./xx/xx.json")
    df = pd.read_clipboard(header=None)  #从剪切板读取文件,参数同 pd.read_table
    print('excel数据', os.linesep, excel_data)
    

      分组txt文件读写

    def write_list(path, li):
        extension = path.split('.')[-1]
        if extension == 'txt':
            with open(path, 'w', encoding='utf-8') as fp:
                #fp.writelines(li)
                fp.write('
    '.join(li))
        elif extension == 'csv':
            pd.DataFrame({'x':li}).to_csv(path, index=False, header=False)
        else:
            raise ValueError('扩展名只能指定为txt或csv...')
    
    def write_by_group(id_dict, outpath, size, extension):
        import os
        import math
        import pandas as pd
        
        if '/' in outpath:
            outfile = outpath + '/{}_{}.' + extension
        else:
            outfile = os.path.join(outpath, '{}_{}.txt')
            
        for gro_ in id_dict.keys():
            ids = id_dict[gro_]
            st, ed = 0, 0
            for i in range(math.ceil(len(ids)/size)):
                ed += size
                write_list(outfile.format(gro_, i), ids[st:ed])
                st = ed
        
    
    def main(infile, outfile, size, extension):
        from collections import defaultdict
        
        with open(infile, 'r', encoding='utf-8') as fp:
            id_dict = defaultdict(lambda: [])
            for s in fp.readlines():
                id_,gro_ = s.strip('
    ').split('|')
                id_dict[gro_] += [id_]
            
        write_by_group(id_dict, outpath, size, extension)
    
    if __name__ == "__main__":
        infile = r'e:新建文本文档.txt'
        outpath = 'e:/新建文件夹'
        size = 3
        main(infile, outpath, size, 'txt')
    

      txt文件读写

    import math
    def write_list(path, li):
    	with open(path, 'w', encoding='utf-8') as fp:
    		fp.writelines(li)
    def main():
    	with open('aa.txt', 'r', encoding='utf-8') as fp:
    		ids = [s.split('|')[0]+'
    ' for s in fp.readlines()]
    		
    	n = 10
    	size = math.ceil(len(ids)/n)
    	st, ed = 0, 0
    	for i in range(n):
    		ed += size
    		write_list('e:a{}.txt'.format(i), ids[st:ed])
    		st = ed
    
    if __name__ == "__main__":
    	main()
    

      内存文件读写

    from io import StringIO
    #f = StringIO()
    #f.write('Hello World!')
    f = StringIO('Hello World!')
    string = f.readlines()
    print(string)
    

      批量读取大文件 

    '''
    报错:OSError: Initializing from file failed
    解决:pd.read_csv('xxx.csv', chunksize=100, engine='python')
    '''
    df = pd.read_csv('xxx.csv', chunksize=1000000)
    for chunk in df: print(chunk.shape)
    
    df = pd.read_csv('xxx.csv', iterator=True)
    df.get_chunk(1000000).shape
    
    def get_df(file):
        mylist = []
        for chunk in pd.read_csv(file, chunksize=1000000):
            mylist.append(chunk)
        temp_df = pd.concat(mylist, axis=0)
        del mylist
        return temp_df.shape
    

       写文件

    df.to_csv("./xx/xx.csv", index=False, encoding='utf-8')
    df.to_excel("./xx/xx.xlsx") #utf_8_sig, gbk, gb2312, gb18030
    df.to_dict(orient = 'records')   #orient默认dict(字典套字典), 还可 ‘list’, ‘series’, ‘split’, ‘records’, ‘index’
    

      excel文件的读写

      python高手之路python处理excel文件

    import xlrd
    
    file_path = r'F:/test.xlsx'   #路径前加 r,读取的文件路径
    file_path = file_path.decode('utf-8')   #文件路径的中文转码
    
    data = xlrd.open_workbook(file_path)   #获取数据
    table = data.sheet_by_name('Sheet1')   #获取sheet
    
    nrows = table.nrows  #获取总行数
    ncols = table.ncols  #获取总列数
    rowvalue = table.row_values(5)  #获取一行的数值,例如第5行
    col_values = table.col_values(6)  #获取一列的数值,例如第6列
    cell_value = table.cell(5,6).value   #获取一个单元格的数值,例如第5行第6列
    

      excel文件读写

    import xlwt
    import xlrd
    from xlutils.copy import copy
    #打开需要操作的excel表
    wb=xlrd.open_workbook(path)
    #复制原有表
    newb=copy(wb)
    #新增sheet,参数是该sheet的名字,可自定义
    wbsheet=newb.add_sheet(dl+'-'+dn)
    #向新sheet中写入数据。本代码中的d是某个dataframe
    wbsheet.write(0,0,'date')
    wbsheet.write(0,1,'visited')
    wbsheet.write(0,2,'success')
    for i in range(d.shape[0]):
      wbsheet.write(i + 1, 0, d.iloc[i, 0])
      for j in range(1,d.shape[1]):
        wbsheet.write(i+1,j,int(d.iloc[i,j]))
    #获取原有excel表中sheet名为‘summary'的sheet
    sumsheet=newb.get_sheet('summary')
    #k表示该sheet的最后一行
    k=len(sumsheet.rows)
    #想原有sheet后面新增数据
    sumsheet.write(k,0,dl+'-'+dn)
    sumsheet.write(k,1,int(sum(d['visited'])))
    sumsheet.write(k,2,int(sum(d['success'])))
    #保存为原有的excel表路径
    newb.save(path)
    

      pd.read_excel,pd.to_excel参数

    pandas.read_excel(io, sheet_name=0, header=0, skiprows=None, 
    skip_footer=0, index_col=None, names=None, usecols=None, 
    parse_dates=False, date_parser=None, na_values=None, 
    thousands=None, convert_float=True, converters=None, 
    dtype=None, true_values=None, false_values=None, 
    engine=None, squeeze=False, **kwds)
    
    DataFrame.to_excel(excel_writer, sheet_name='Sheet1', na_rep='',
     float_format=None, columns=None, header=True, index=True, 
     index_label=None, startrow=0, startcol=0, engine=None, 
     merge_cells=True, encoding=None, inf_rep='inf', verbose=True, 
     freeze_panes=None)
    

      

       追加excel文件,不覆盖原有数据

    def excelAddSheet(dataframe, filepath, sheet_name):
        import os
        from openpyxl import load_workbook
        
        if os.path.exists(filepath):
            #当表名已存在时,后面还可以添加参数,进行追加
            book = load_workbook(filepath)  #FileNotFoundError
            excelWriter = pd.ExcelWriter(filepath, engine='openpyxl')
            excelWriter.book = book
            dataframe.to_excel(excel_writer=excelWriter, sheet_name=sheet_name, index=None)
            excelWriter.save()
            excelWriter.close()
        else:
            dataframe.to_excel(filepath, sheet_name=sheet_name, index=None)
        
           
    excelAddSheet(df2[:2], filepath, 'Sheet1')
    excelAddSheet(df2[:2], filepath, 'Sheet2')
    

      

    9. 数据库文件读写

    import pandas as pd
    from sqlalchemy import create_engine, types
    
    config = {db_flag : "mysql", host_ip : "127.0.0.1", host_port : 3306, db_name : "test", table_name : "aaa", user : "root", pawd : "root", charset : "utf-8"}
    engine_config = '''mysql+pymysql://{user}:{pawd}@{host_ip}:{host_port}/{db_name}?charset={charset}'''.format(**config)
    engine= create_engine(engine_config)
    pd.read_sql_query(sql, engine)
    conn = engine.connect()
    df.to_sql(table_name, conn, if_exists='append', index=False)
    

      

    10. 断言 - 单元测试

    assert 1==1
    assert 1==2
    #Traceback (most recent call last):
    #  File "<ipython-input-174-730332727407>", line 1, in <module>
    #    assert 1==2
    #AssertionError
    
    #等价于
    if not 1==2:
        raise AssertionError
    #Traceback (most recent call last):
    #  File "<ipython-input-176-bd2dcc9e33c8>", line 2, in <module>
    #    raise AssertionError
    #AssertionError
    
    #try ... except .. else
    #自定义新异常
    class Networkerror(RuntimeError):
        def __init__(self, arg):
            self.args = arg
    
    try:
        raise Networkerror("Bad hostname")
    except Networkerror,e:
        print e.args
    

      11.Python读取大文件的"坑“与内存占用检测

      

      

     
  • 相关阅读:
    存储引擎的优缺点及增删改查基本操作
    安装Mariadb
    Mysql 入门概念
    Nginx语法着色
    find用法,文件压缩和lsof和cpio
    软件包管理
    Django 生成六位随机图片验证码
    Django自定义过滤器和自定义标签
    Django零碎知识点
    jQuery实现淡入淡出样式轮播
  • 原文地址:https://www.cnblogs.com/iupoint/p/11138798.html
Copyright © 2011-2022 走看看