import numpy as np
from pandas import *
from pypinyin import pinyin, lazy_pinyin

# Columns that carry identifiers/labels; every other column is coerced to float.
_LABEL_COLUMNS = ('REGION', 'NSC', 'NSC_SUB', 'nsc2', 'DICT')

# Separator (ideographic comma) used when one row lists several DICT codes.
_DICT_SEPARATOR = '、'


def _fill_missing(series, value):
    """Return *series* as object dtype with missing entries replaced by *value*.

    Casting to object first lets a column that was read as all-NaN (float64)
    accept a string fill value without a dtype error.
    """
    as_object = series.astype(object)
    return as_object.where(as_object.notna(), value)


def _romanize_region(value):
    """Return the pinyin syllables of a string joined together.

    Non-string values (e.g. NaN) and strings for which ``lazy_pinyin``
    returns nothing are passed through unchanged.
    """
    if not isinstance(value, str):
        return value
    syllables = lazy_pinyin(value)
    return ''.join(syllables) if syllables else value


def _split_multi_dict_rows(df):
    """Expand rows whose DICT holds several separator-joined codes.

    Each such row is replaced by one row per code (the codes stay strings);
    the expanded rows are placed after all untouched rows, in original order.
    """
    is_multi = np.array(
        [isinstance(v, str) and _DICT_SEPARATOR in v for v in df['DICT']],
        dtype=bool,
    )
    if not is_multi.any():
        return df
    expanded = df.loc[is_multi].copy()
    expanded['DICT'] = expanded['DICT'].str.split(_DICT_SEPARATOR)
    expanded = expanded.explode('DICT')
    return concat([df.loc[~is_multi], expanded], ignore_index=True)


def PrepDf(fname, path='', encoding='big5'):
    """Load a weighting table from CSV and normalise it into a DataFrame.

    The CSV must contain the columns ``NSC``, ``NSC_SUB``, ``REGION`` and
    ``DICT``; all remaining columns must be convertible to float.

    Processing steps, in order:

    1. Missing ``NSC_SUB`` values become ``'b'`` and a combined key column
       ``nsc2`` (``str(NSC) + NSC_SUB``) is added.
    2. String ``REGION`` values are romanised with ``lazy_pinyin``.
    3. Rows whose ``DICT`` lists several codes separated by ``'、'`` are split
       into one row per code.
    4. Missing ``REGION`` values become ``'quanguo'``.
    5. Every column other than the label columns is cast to float.
    6. If ``nsc2 == '51b'`` exists and ``'51A'`` does not, the ``51b`` rows are
       copied four times with ``nsc2`` set to ``51A`` .. ``51D`` (the ``51b``
       rows themselves are kept).
    7. If no row has ``REGION == 'taizhongjichang'``, the ``tainanjichang``
       rows are copied with ``DICT = 3605`` and that region name.
    8. Exact duplicate rows are dropped and the index is reset.

    Args:
        fname: CSV file name; it is concatenated directly onto ``path``.
        path: Prefix prepended to ``fname`` by plain string concatenation, so
            a directory must end with its separator. Defaults to ``''``
            (a site-specific absolute directory was hard-coded here before).
        encoding: Text encoding handed to ``read_csv``. Defaults to ``'big5'``.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    df = read_csv(path + fname, encoding=encoding)

    # Rows without a sub-code get the placeholder 'b' so nsc2 is always a string.
    df['NSC_SUB'] = _fill_missing(df['NSC_SUB'], 'b')
    df['nsc2'] = [str(nsc) + sub for nsc, sub in zip(df['NSC'], df['NSC_SUB'])]

    df['REGION'] = df['REGION'].map(_romanize_region)

    df = _split_multi_dict_rows(df)

    # A row without a region is labelled 'quanguo'.
    df['REGION'] = _fill_missing(df['REGION'], 'quanguo')

    for column in df.columns:
        if column in _LABEL_COLUMNS:
            continue
        df[column] = df[column].astype(float)

    # Change 51b to 51A~51D automatically so it need not be edited by hand.
    # The expansion is all-or-nothing: it is skipped as soon as 51A exists.
    nsc2_codes = set(df['nsc2'])
    if '51b' in nsc2_codes and '51A' not in nsc2_codes:
        base_51b = df.loc[df['nsc2'] == '51b']
        clones = [base_51b.assign(nsc2='51' + suffix) for suffix in 'ABCD']
        df = concat([df] + clones, ignore_index=True, sort=False)

    # taizhongjichang is missing from the data; fill it from tainanjichang.
    if 'taizhongjichang' not in set(df['REGION']):
        template = df.loc[df['REGION'] == 'tainanjichang']
        if not template.empty:
            filled = template.assign(DICT=3605, REGION='taizhongjichang')
            df = concat([df, filled], ignore_index=True)

    df = df.drop_duplicates().reset_index(drop=True)
    return df