Python DM & ML
Objects¶
Series¶
## create Series
obj = pd.Series([1,9,1,9], index = ['a','b','c','u'])
# or using dictionaries
obj = pd.Series({'a':1, 'b':9, 'c':1, 'd':9})
obj.values
obj.index
# change the index
obj.index = ['index0', 'index1', ...]
# name the series
obj.name = 'obj_name'
# name the index of the series
obj.index.name = 'index_name'
obj[['a', 'b']]
obj[obj > 8] # index by boolean
# treat series as a fixed-length ordered dictionary
'b' in obj # search in the keys (indices)
## operation on series or data frames
obj * 2
np.exp(obj)
# apply a function elementwise
def diy_func(ele):
    """Return ``ele ** 2 - 1`` for a single element.

    Intended to be passed to ``Series.apply`` so it is evaluated
    once per element of the series.
    """
    return ele ** 2 - 1
obj.apply(diy_func)
Data Frames¶
Creating data frames
## Create data frames
my_dict = {
"key1": value1,
"key2": value2,
"key3": value3
}
# Creating DataFrames from a list of dictionaries (by row)
list_of_dicts = [{"name":"zoe", "breed":"human"}, {"name":"eplus", "breed":"human"}]
df = pd.DataFrame(list_of_dicts)
# From dictionaries of lists (by column)
dicts_of_lists = {"name":["zoe", "eplus"],
"breed":["human", "human"]}
df = pd.DataFrame(dicts_of_lists)
# from csv file: comma-separated values (designed for DataFrame-like data)
df = pd.read_csv("path_to_the_csv_file")
df.to_csv("output_path")
Navigate data frames & basic operations.
df.columns
df.loc[:, ["col1", "col2", ...]]
# for multi-level index
df.loc[[('col0', 'col10'), ('col2', 'col21')]]
df.iloc[4,]
## deal with index
df.index
df.set_index("index_name")
df.reset_index() # move the index back into a regular column
df.reset_index(drop=True) # discard the index
# sort index by some columns
df.sort_index(level=["col1", "col2", ...], ascending=[True, False])
## filtering
filter_list = ['identity1', 'identity2', ...]
df[df['col'].isin(filter_list)]
## operations
# drop column
df.drop("col", axis=1)
# drop row
df.drop([0,1], axis=0)
Using pivot table to get statistics.
# pivot tables
df = df.pivot_table(values="value_to_be_processed", index=["index_1", "index_2"], columns="col") # the default aggfunc is mean()
Operating with Strings¶
# split
df['string_col'].str.split(':')
# return the first of the split parts
df['string_col'].str.split(':').str.get(0)
# return a new DataFrame
df['string_col'].str.split(':',expand=True)
df[['main_string_col', 'substring_col']] = df['string_col'].str.split(':', expand=True)
df.drop('string_col', axis=1, inplace=True)
# concatenate
df_new['name_author'].str.cat(df_new['lastname_author'], sep=' ')
# can also concatenate / split index with other columns
df.index.str.cat(df['string_col'], sep='-')
df.index.str.split('-', expand=True)
Operating with Date¶
# acquire the components of a date
df["col"].dt.month
df["col"].dt.year
Visualization¶
import matplotlib.pyplot as plt
# histograms
df["col"].hist(bins=20) # change the bin number
# bar plots, for categorical and numeric variables
df.plot(kind="bar", title="title")
# line plots
df.plot(x="x-axis-label", y="y-axis-label", kind="line", rot=45)
# scatter plots, for relationships between numeric variables
df.plot(x="x-axis-label", y="y-axis-label", kind="scatter")
# layering plots
df[some conditions].hist(alpha=0.7) # transparency
df[some conditions].hist(alpha=0.7)
plt.legend(["plot-1", "plot-2"])
plt.show()
Data Preprocessing¶
Factor¶
incomplete¶
# similar to R's factor(vector)
from sklearn import preprocessing
Missing Values¶
# Null value
obj = pd.Series({'a':1, 'b':np.nan})
pd.isnull(obj) # obj.isnull()
df.isnull().sum() # check all the columns
pd.notnull(obj) # obj.notnull()
# proportion of nulls
pd.isnull(obj).sum() / obj.shape[0]
# encode other symbol to null value (suppose the missing value are annotated as '?')
df[df == '?'] = np.nan
df.isnull()
Impute Missing Values with:¶
## mode
col = ... # list or string
# for col in co:
df[col] = df[col].fillna(df[col].mode()[0])
OS¶
import os
# acquire directory items
directory_path = 'D:/Data/Path'
entries = os.listdir(directory_path)
for entry in entries:
print(entry)
# acquire item path
entry_0_path = os.path.join(directory_path, entries[0])
Model Selection¶
Split Data¶
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
Scikit-learn 数据预处理¶
标准化: StandardScaler
将特征缩放到某个范围:
处理异常值: RobustScaler
核矩阵的中心化:
非线性转换
分位数转换函数 QuantileTransformer
归一化函数 normalize
生成多项式特征;
样条函数特征生成函数 SplineTransformer
自定义转换器 FunctionTransformer
Sklearn 处理缺失值¶
单变量插补(univariate feature imputation)
多变量插补:
最近邻插补
删除低方差的 VarianceThreshold
生物信息¶
epi.pp
preprocessing
remove low quality cells:
epi.pp.filter_cells(adata, min_features = 10)
epi.pp.filter_features(adata, min_cells = 10)
reduce feature space to
the most variable features check
epi.pl.cal_var(adata)
epi.pp.select_var_feature(adata, max_score = 0.2, nb_features = 50000)