Alibaba Cloud Security Malicious Program Detection - Rank 295


Competition Description

The data provided consists of API call sequences obtained by running the files (Windows executables) in a sandbox. All samples are Windows binary executables and have been desensitized. All samples were collected from the Internet; the malicious files include file-infecting viruses, trojans, mining programs, DDoS trojans, ransomware, and more, totaling 600 million records.

For full details, please refer to the official competition page.

Data description: (figure omitted)

Basic approach:

- The dataset is very large, so downcast column dtypes to reduce memory usage (a quick sketch follows this list)
- Cross-validation
- LightGBM performs reasonably well
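
For the first point, a minimal sketch of the idea (my own shorthand, assuming a generic numeric DataFrame `df`; the competition code below uses a fuller hand-rolled helper class): pandas can downcast numeric columns directly with `pd.to_numeric`.

```python
import pandas as pd

def downcast_df(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink each numeric column to the smallest dtype that holds its values."""
    for col in df.select_dtypes(include='integer').columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')  # e.g. int64 -> int8
    for col in df.select_dtypes(include='float').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')    # e.g. float64 -> float32
    return df
```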


Full code:

```python
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm_notebook

warnings.filterwarnings('ignore')


class _Data_Preprocess:
    """Downcast each numeric column to the smallest dtype that can hold its values."""

    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        for col in tqdm_notebook(df.columns):
            try:
                if 'float' in str(df[col].dtypes):
                    trans_types = self._get_type(df[col].min(), df[col].max(), 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    trans_types = self._get_type(df[col].min(), df[col].max(), 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except Exception:
                print('Can not do any process for column {}.'.format(col))
        after_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(after_memory))
        return df


os.chdir(r'E:\项目文件\阿里云安全恶意程序检测')
train = pd.read_csv('security_train.csv')
test = pd.read_csv('security_test.csv')
```
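
Note that `_Data_Preprocess` is defined but never invoked in the script as posted; a minimal usage sketch on the freshly loaded frames might be:

```python
# Sketch: downcast the raw tables right after loading to cut memory use.
memory_process = _Data_Preprocess()
train = memory_process._memory_process(train)
test = memory_process._memory_process(test)
```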
Per-file aggregate features are built by grouping the raw call log on `file_id`, followed by a custom multiclass logloss eval for LightGBM:

```python
def simple_sts_features(df):
    # Count / nunique aggregates per file_id.
    simple_fea = pd.DataFrame()
    simple_fea['file_id'] = df['file_id'].unique()
    simple_fea = simple_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_fea['file_id_api_count'] = df_grp['api'].count().values
    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values
    simple_fea['file_id_tid_count'] = df_grp['tid'].count().values
    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values
    simple_fea['file_id_index_count'] = df_grp['index'].count().values
    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values
    return simple_fea

simple_train_fea1 = simple_sts_features(train)
simple_test_fea1 = simple_sts_features(test)

def simple_numerical_sts_features(df):
    # Mean / min / std / max aggregates per file_id.
    simple_numerical_fea = pd.DataFrame()
    simple_numerical_fea['file_id'] = df['file_id'].unique()
    simple_numerical_fea = simple_numerical_fea.sort_values('file_id')
    df_grp = df.groupby('file_id')
    simple_numerical_fea['file_id_tid_mean'] = df_grp['tid'].mean().values
    simple_numerical_fea['file_id_tid_min'] = df_grp['tid'].min().values
    simple_numerical_fea['file_id_tid_std'] = df_grp['tid'].std().values
    simple_numerical_fea['file_id_tid_max'] = df_grp['tid'].max().values
    simple_numerical_fea['file_id_index_mean'] = df_grp['index'].mean().values
    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values
    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values
    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values
    return simple_numerical_fea

simple_train_fea2 = simple_numerical_sts_features(train)
simple_test_fea2 = simple_numerical_sts_features(test)

train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# Build the train & test sets
train_data = train_label.merge(simple_train_fea1, on='file_id', how='left')
train_data = train_data.merge(simple_train_fea2, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea1, on='file_id', how='left')
test_submit = test_submit.merge(simple_test_fea2, on='file_id', how='left')

def lgb_logloss(preds, data):
    # Custom eval metric: rebuild the per-class probability matrix from the
    # flat prediction array, then accumulate a logloss-style score.
    labels_ = data.get_label()
    classes_ = np.unique(labels_)
    preds_prob = []
    for i in range(len(classes_)):
        preds_prob.append(preds[i * len(labels_):(i + 1) * len(labels_)])
    preds_prob_ = np.vstack(preds_prob)
    loss = []
    for i in range(preds_prob_.shape[1]):      # number of samples
        sum_ = 0
        for j in range(preds_prob_.shape[0]):  # number of classes
            pred = preds_prob_[j, i]           # P(sample i is class j)
            if j == labels_[i]:
                sum_ += np.log(pred)
            else:
                sum_ += np.log(1 - pred)
        loss.append(sum_)
    return 'loss is: ', -1 * (np.sum(loss) / preds_prob_.shape[1]), False

# Model validation
train_features = [col for col in train_data.columns if col not in ['label', 'file_id']]
train_label = 'label'  # note: reuses the name train_label, now as the target column name
```
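
To make the slicing in `lgb_logloss` concrete: the code assumes LightGBM passes the multiclass predictions as one flat array laid out class by class. A toy, self-contained illustration of that reshaping (made-up numbers, not competition data):

```python
import numpy as np

# 3 samples, 2 classes, flat class-major layout:
# [c0_s0, c0_s1, c0_s2, c1_s0, c1_s1, c1_s2]
flat = np.array([0.9, 0.2, 0.6, 0.1, 0.8, 0.4])
n_samples, n_classes = 3, 2
prob = np.vstack([flat[i * n_samples:(i + 1) * n_samples]
                  for i in range(n_classes)])
print(prob.shape)  # (2, 3): row j holds P(class j) for every sample
print(prob[1, 0])  # P(sample 0 is class 1) -> 0.1
```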
Five-fold cross-validation with LightGBM:

```python
from sklearn.model_selection import KFold

params = {
    'task': 'train',
    'num_leaves': 255,
    'objective': 'multiclass',
    'num_class': 8,
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'max_bin': 128,
}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features],
                           label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features],
                           label=train_data.iloc[val_idx][train_label].values)
    clf = lgb.train(params, trn_data, num_boost_round=2000,
                    valid_sets=[trn_data, val_data], verbose_eval=50,
                    early_stopping_rounds=100, feval=lgb_logloss)
    models.append(clf)
```
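
The original script also declared an out-of-fold array that was never filled; as a hedged sketch (my addition, not part of the posted pipeline), the fold loop above could collect out-of-fold probabilities for offline scoring:

```python
# Sketch: one row per file_id, 8 class columns.
oof = np.zeros((len(train_data), 8))
# Inside the fold loop, right after lgb.train(...):
#   oof[val_idx] = clf.predict(train_data.iloc[val_idx][train_features],
#                              num_iteration=clf.best_iteration)
```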
Feature importance (from the last fold's model) and averaged test-set predictions:

```python
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp', ascending=False)
plt.figure(figsize=(20, 10))
sns.barplot(x=feature_importance['fea_name'], y=feature_importance['fea_imp'])
# sns.barplot(x="fea_name", y="fea_imp", data=feature_importance)

# Average the predictions of the five fold models.
pred_res = 0
fold = 5
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / fold

prob_cols = ['prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']
for i, col in enumerate(prob_cols):
    test_submit[col] = pred_res[:, i]
test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=False)
```
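
Before uploading, a quick sanity check (a sketch using only the frames defined above) is cheap: the averaged softmax outputs are still valid probabilities, so every row of the eight prob columns should sum to about 1.

```python
# Sketch: verify the submission file looks sane before uploading.
row_sums = test_submit[prob_cols].sum(axis=1)
print(row_sums.describe())                    # every row should be ~1.0
assert test_submit[prob_cols].values.min() >= 0
assert np.allclose(row_sums, 1.0, atol=1e-3)
```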

If you found this helpful, remember to like, favorite, and follow!

Source: https://data-mining.blog.csdn.net/article/details/109557312
