{ "metadata": { "name": "", "signature": "sha256:b0eb9b534082cfefbc26df49a4e010771e27afb7ad25a9e4b43276f6c5ea724e" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "# Java Class File Analysis\n", "In this notebook we're going to explore, understand, and classify Java class files as being 'benign' or 'malicious'. We will explore the data, apply machine learning algorithms to the data, add new features, do more machine learning. Then we will test our classifier on a large number of files to measure its effectiveness.\n", "\n", "### References\n", "\n", "\n", " | acc_abstract | \n", "acc_annotation | \n", "acc_enum | \n", "acc_final | \n", "acc_interface | \n", "acc_public | \n", "acc_super | \n", "acc_synthetic | \n", "ap_count | \n", "class name | \n", "class_name_digit_run_avg | \n", "class_name_digit_run_longest | \n", "class_name_length | \n", "class_name_lowercase_run_avg | \n", "class_name_lowercase_run_longest | \n", "class_name_slash_count | \n", "class_name_uppercase_run_avg | \n", "class_name_uppercase_run_longest | \n", "constant_pool_count | \n", "entropy | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "3 | \n", "com/google/common/collect/ForwardingConcurrentMap | \n", "0 | \n", "0 | \n", "49 | \n", "6.000000 | \n", "9 | \n", "4 | \n", "1.0 | \n", "1 | \n", "54 | \n", "4.990507 | \n", "... | \n", "
1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "org/apache/hadoop/io/compress/GzipCodec$GzipOu... | \n", "0 | \n", "0 | \n", "82 | \n", "4.846154 | \n", "8 | \n", "5 | \n", "1.5 | \n", "5 | \n", "39 | \n", "5.205063 | \n", "... | \n", "
2 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "2 | \n", "com/google/common/collect/Multisets$Unmodifiab... | \n", "0 | \n", "0 | \n", "62 | \n", "6.625000 | \n", "11 | \n", "4 | \n", "1.0 | \n", "1 | \n", "131 | \n", "4.996721 | \n", "... | \n", "
3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "hu/openig/mechanics/StaticDefensePlanner$1 | \n", "1 | \n", "1 | \n", "42 | \n", "5.666667 | \n", "9 | \n", "3 | \n", "1.0 | \n", "1 | \n", "56 | \n", "5.282413 | \n", "... | \n", "
4 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "org/apache/commons/io/LineIterator | \n", "0 | \n", "0 | \n", "34 | \n", "4.666667 | \n", "7 | \n", "4 | \n", "1.0 | \n", "1 | \n", "95 | \n", "5.285082 | \n", "... | \n", "
5 rows \u00d7 36 columns
\n", "\n", " | acc_final | \n", "acc_public | \n", "acc_super | \n", "ap_count | \n", "class name | \n", "class_name_digit_run_avg | \n", "class_name_digit_run_longest | \n", "class_name_length | \n", "class_name_lowercase_run_avg | \n", "class_name_lowercase_run_longest | \n", "class_name_slash_count | \n", "class_name_uppercase_run_avg | \n", "class_name_uppercase_run_longest | \n", "constant_pool_count | \n", "entropy | \n", "interface_count | \n", "interfaces | \n", "major version | \n", "method names | \n", "method_name_digit_run_avg | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "1 | \n", "1 | \n", "2 | \n", "Main | \n", "0 | \n", "0 | \n", "4 | \n", "3.0 | \n", "3 | \n", "0 | \n", "1.000000 | \n", "1 | \n", "86 | \n", "6.114522 | \n", "0 | \n", "[] | \n", "48 | \n", "[<init>, init] | \n", "0 | \n", "... | \n", "
1 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "YdCdHX/VcZaXVjyy | \n", "0 | \n", "0 | \n", "16 | \n", "1.4 | \n", "3 | \n", "1 | \n", "1.333333 | \n", "2 | \n", "52 | \n", "5.539514 | \n", "0 | \n", "[] | \n", "49 | \n", "[<init>, ktCgxlqo, <clinit>] | \n", "0 | \n", "... | \n", "
2 | \n", "0 | \n", "1 | \n", "1 | \n", "2 | \n", "aOcMSp | \n", "0 | \n", "0 | \n", "6 | \n", "1.0 | \n", "1 | \n", "0 | \n", "1.500000 | \n", "2 | \n", "159 | \n", "5.953528 | \n", "0 | \n", "[] | \n", "49 | \n", "[<init>, gvuNr, <clinit>] | \n", "0 | \n", "... | \n", "
3 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "a/zylasqwjlpbqyrwrr | \n", "0 | \n", "0 | \n", "19 | \n", "9.0 | \n", "17 | \n", "1 | \n", "0.000000 | \n", "0 | \n", "478 | \n", "6.348531 | \n", "0 | \n", "[] | \n", "49 | \n", "[<init>, eiaxyercdfvbgscpbv, yginlmcynkyuohnfh... | \n", "0 | \n", "... | \n", "
4 | \n", "0 | \n", "1 | \n", "1 | \n", "2 | \n", "tljpjunbjwtqlywm/sdnrybknlf | \n", "0 | \n", "0 | \n", "27 | \n", "13.0 | \n", "16 | \n", "1 | \n", "0.000000 | \n", "0 | \n", "122 | \n", "5.376762 | \n", "0 | \n", "[] | \n", "49 | \n", "[<init>, dvvwse, <clinit>] | \n", "0 | \n", "... | \n", "
5 rows \u00d7 31 columns
\n", "\n", " | method name | \n", "
---|---|
0 | \n", "<init> | \n", "
1 | \n", "init | \n", "
2 | \n", "ktCgxlqo | \n", "
3 | \n", "<clinit> | \n", "
4 | \n", "gvuNr | \n", "
5 | \n", "eiaxyercdfvbgscpbv | \n", "
6 | \n", "yginlmcynkyuohnfhe | \n", "
7 | \n", "mtyvzetsjhvnbyz | \n", "
8 | \n", "fxxhgjttqfavlooxcb | \n", "
9 | \n", "wyjgamzmowywjihkuuf | \n", "
10 | \n", "kgthsnqdqutacivcptong | \n", "
11 | \n", "qgasjqrogibkblyzourtq | \n", "
12 | \n", "glfouhczfxzyskaystx | \n", "
13 | \n", "mikczoanebdkwpyb | \n", "
14 | \n", "bwssduenvebnvgix | \n", "
15 | \n", "wafrcwijizypmitodmb | \n", "
16 | \n", "bfznyeevclzzxxqbw | \n", "
17 | \n", "jmzisxwtxhekbkl | \n", "
18 | \n", "szivddjiptybevduli | \n", "
19 | \n", "forwnxmgnutbtdwvptj | \n", "
20 | \n", "mwwmrvljafpkwzdiy | \n", "
21 | \n", "vvpbdzrhvvnzaieyi | \n", "
22 | \n", "qkkxoygluwwlnwbxu | \n", "
23 | \n", "dvvwse | \n", "
24 | \n", "c | \n", "
25 | \n", "k | \n", "
26 | \n", "main | \n", "
27 | \n", "writeEmbeddedFile | \n", "
28 | \n", "bootstrap | \n", "
29 | \n", "getJreExecutable | \n", "
30 | \n", "addExtension | \n", "
31 | \n", "findInDir | \n", "
32 | \n", "normalize | \n", "
33 | \n", "dissect | \n", "
34 | \n", "class$ | \n", "
35 | \n", "tgznSIAR | \n", "
36 | \n", "kWfVWtw | \n", "
37 | \n", "BodFzDax | \n", "
38 | \n", "xXVBwx | \n", "
39 | \n", "VdJiGyZfj | \n", "
40 | \n", "taddhnwrkj | \n", "
41 | \n", "C | \n", "
42 | \n", "ALLATORI_DEMO | \n", "
43 | \n", "jvsamhqyvgekftsj | \n", "
44 | \n", "knjkb | \n", "
45 | \n", "B | \n", "
46 | \n", "cmjnkr | \n", "
47 | \n", "jmdpes | \n", "
48 | \n", "tqffjybms | \n", "
49 | \n", "vtvtmh | \n", "
50 rows \u00d7 1 columns
\n", "\n", " | method name | \n", "
---|---|
0 | \n", "<init> | \n", "
1 | \n", "delegate | \n", "
2 | \n", "putIfAbsent | \n", "
3 | \n", "remove | \n", "
4 | \n", "replace | \n", "
5 | \n", "resetState | \n", "
6 | \n", "comparator | \n", "
7 | \n", "createElementSet | \n", "
8 | \n", "elementSet | \n", "
9 | \n", "descendingMultiset | \n", "
10 | \n", "firstEntry | \n", "
11 | \n", "lastEntry | \n", "
12 | \n", "pollFirstEntry | \n", "
13 | \n", "pollLastEntry | \n", "
14 | \n", "headMultiset | \n", "
15 | \n", "subMultiset | \n", "
16 | \n", "tailMultiset | \n", "
17 | \n", "invoke | \n", "
18 | \n", "hasNext | \n", "
19 | \n", "isValidLine | \n", "
20 | \n", "next | \n", "
21 | \n", "nextLine | \n", "
22 | \n", "close | \n", "
23 | \n", "closeQuietly | \n", "
24 | \n", "exec | \n", "
25 | \n", "getInitial | \n", "
26 | \n", "getIntermed | \n", "
27 | \n", "getFinal | \n", "
28 | \n", "max | \n", "
29 | \n", "outputSchema | \n", "
30 | \n", "estimateLength | \n", "
31 | \n", "appendTo | \n", "
32 | \n", "getXPath | \n", "
33 | \n", "run | \n", "
34 | \n", "secToHMS | \n", "
35 | \n", "contribute | \n", "
36 | \n", "onBeforeRender | \n", "
37 | \n", "setCloseEvent | \n", "
38 | \n", "setSelectEvent | \n", "
39 | \n", "setChangeEvent | \n", "
40 | \n", "setSource | \n", "
41 | \n", "statement | \n", "
42 | \n", "setDocumentLocator | \n", "
43 | \n", "startDocument | \n", "
44 | \n", "endDocument | \n", "
45 | \n", "startPrefixMapping | \n", "
46 | \n", "endPrefixMapping | \n", "
47 | \n", "startElement | \n", "
48 | \n", "endElement | \n", "
49 | \n", "characters | \n", "
50 rows \u00d7 1 columns
\n", "\n", " | acc_abstract | \n", "acc_annotation | \n", "acc_enum | \n", "acc_final | \n", "acc_interface | \n", "acc_public | \n", "acc_super | \n", "acc_synthetic | \n", "ap_count | \n", "attributes count | \n", "class name | \n", "class_name_digit_run_avg | \n", "class_name_digit_run_longest | \n", "class_name_length | \n", "class_name_lowercase_run_avg | \n", "class_name_lowercase_run_longest | \n", "class_name_slash_count | \n", "class_name_uppercase_run_avg | \n", "class_name_uppercase_run_longest | \n", "constant_pool_count | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "1 | \n", "com/jidesoft/combobox/DateChooserPanel | \n", "0 | \n", "0 | \n", "38 | \n", "5.333333 | \n", "8 | \n", "3 | \n", "1 | \n", "1 | \n", "1037 | \n", "... | \n", "
1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "3 | \n", "0 | \n", "org/jmol/modelset/BondIterator | \n", "0 | \n", "0 | \n", "30 | \n", "5.000000 | \n", "8 | \n", "3 | \n", "1 | \n", "1 | \n", "11 | \n", "... | \n", "
2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "2 | \n", "org/hibernate/engine/query/ParameterParser | \n", "0 | \n", "0 | \n", "42 | \n", "6.000000 | \n", "9 | \n", "4 | \n", "1 | \n", "1 | \n", "152 | \n", "... | \n", "
3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "1 | \n", "com/intellij/updater/Utils | \n", "0 | \n", "0 | \n", "26 | \n", "5.500000 | \n", "8 | \n", "3 | \n", "1 | \n", "1 | \n", "330 | \n", "... | \n", "
4 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "1 | \n", "com/kiwisoft/db/driver/SybaseDriver | \n", "0 | \n", "0 | \n", "35 | \n", "4.833333 | \n", "8 | \n", "4 | \n", "1 | \n", "1 | \n", "151 | \n", "... | \n", "
5 rows \u00d7 35 columns
\n", "