Merge pull request #55 from awest1339/celery
Add standalone docker containers
Drewsif authored Nov 16, 2017
2 parents 2889e04 + d123eee commit d60a268
Showing 23 changed files with 999 additions and 15 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -78,6 +78,10 @@ Results is a dictionary object where each key is a filename of a scanned file.
`multiscanner.config_init(filepath)` will create a default configuration file at
the location defined by filepath.

Distributed MultiScanner
------------------------
MultiScanner is also part of a distributed, scalable file analysis framework, complete with distributed task management, a web interface, a REST API, and report storage. Please see [Distributed Multiscanner](<docs/distributed_multiscanner.md>) for more details. Additionally, we distribute a standalone Docker container with the base set of features (web UI, REST API, ElasticSearch node) as an introduction to the capabilities of this Distributed MultiScanner. See [here](<docs/docker_standalone.md>) for more details. (*Note*: this standalone container should not be used in production; it is simply a primer on what a full installation would look like.)

Other Reading
-------------
For more information on module configuration or writing modules check the
47 changes: 47 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,47 @@
version: '3'
services:
elastic:
image: "docker.elastic.co/elasticsearch/elasticsearch:5.6.3"
ports:
- "9200:9200"
- "9300:9300"
environment:
- "discovery.type=single-node"
- "script.painless.regex.enabled=true"
- "xpack.security.enabled=false"
web:
build:
context: .
dockerfile: docker_utils/Dockerfile_web
# If you are behind a proxy, you must uncomment
# the next 3 lines
# args:
# - http_proxy
# - https_proxy
ports:
- "127.0.0.1:8000:8000"
# If you are behind a proxy, you must set the
# proxy settings here (uncomment the next 4 lines)
# environment:
# - "http_proxy=http://proxy.example:80"
# - "https_proxy=http://proxy.example:80"
# - "no_proxy=localhost,127.0.0.1"
api:
build:
context: .
dockerfile: docker_utils/Dockerfile_api
# If you are behind a proxy, you must uncomment
# the next 3 lines
# args:
# - http_proxy
# - https_proxy
ports:
- "127.0.0.1:8080:8080"
# If you are behind a proxy, you must set the
# proxy settings here (uncomment the next 4 lines)
# environment:
# - "http_proxy=http://proxy.example:80"
# - "https_proxy=http://proxy.example:80"
# - "no_proxy=localhost,127.0.0.1"
depends_on:
- elastic
16 changes: 16 additions & 0 deletions docker_utils/Dockerfile_api
@@ -0,0 +1,16 @@
FROM python:3.6
MAINTAINER Austin West awest1339

ENV PYTHON_VER 3.6.2

COPY . /opt/multiscanner
COPY ./docker_utils/*.ini /opt/multiscanner/

WORKDIR /opt/multiscanner
RUN ./install.sh

# Run script
# The sleep is a workaround to make extra sure that
# the ElasticSearch container is up before the api
# attempts to connect to it
CMD sleep 30; python utils/api.py
13 changes: 13 additions & 0 deletions docker_utils/Dockerfile_web
@@ -0,0 +1,13 @@
FROM python:3.6
MAINTAINER Austin West awest1339

ENV PYTHON_VER 3.6.2

COPY . /opt/multiscanner
COPY ./docker_utils/*.ini /opt/multiscanner/

WORKDIR /opt/multiscanner
RUN ./install.sh

# Run script
CMD python web/app.py
26 changes: 26 additions & 0 deletions docker_utils/api_config.ini
@@ -0,0 +1,26 @@
[api]
host = 0.0.0.0
port = 8080
upload_folder = /tmp/
distributed = False
web_loc = http://localhost:8000/
cors = https?://localhost(:\d+)?
batch_size = 100
batch_interval = 2

[celery]
protocol = pyamqp
host = localhost
user = guest
password =
vhost = /
flush_every = 100
flush_interval = 10

[Database]
db_type = sqlite
host_string = localhost
db_name = task_db
username = multiscanner
password = CHANGEME
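The `cors` value in the `[api]` section above is a regular expression matched against request origins. As a quick illustrative sketch of how such a pattern behaves (whether the API uses `re.fullmatch` or some other matching call is an assumption, not shown in this diff):

```python
import re

# Pattern copied from the [api] section above: http or https on
# localhost, with an optional port.
CORS_PATTERN = r"https?://localhost(:\d+)?"

def origin_allowed(origin):
    """Return True if the origin matches the configured CORS regex."""
    return re.fullmatch(CORS_PATTERN, origin) is not None

print(origin_allowed("http://localhost:8000"))  # True
print(origin_allowed("https://evil.example"))   # False
```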

140 changes: 140 additions & 0 deletions docker_utils/config.ini
@@ -0,0 +1,140 @@
[AVGScan]
path = C:\Program Files\AVG\AVG2014\avgscanx.exe
key = /opt/multiscanner/etc/id_rsa
cmdline = ['/A', '/H', '/PRIORITY=High']
host = ('MultiScanner', 22, 'User')
replacement path = X:\
ENABLED = False

[ClamAVScan]
ENABLED = False

[MSEScan]
path = C:\Program Files\Microsoft Security Client\MpCmdRun.exe
key = /opt/multiscanner/etc/id_rsa
cmdline = ['-Scan', '-ScanType', '3', '-DisableRemediation', '-File']
host = ('MultiScanner', 22, 'User')
replacement path = X:\
ENABLED = False

[McAfeeScan]
path = C:\vscl-w32-604-e\scan.exe
key = /opt/multiscanner/etc/id_rsa
cmdline = ['/ALL']
host = ('MultiScanner', 22, 'User')
replacement path = X:\
ENABLED = False

[Metadefender]
ENABLED = False
API URL = http://metadefender:8008/
timeout = 60
running timeout = 30
fetch delay seconds = 5
poll interval seconds = 5
user agent = user_agent
API key =

[vtsearch]
apikey = None
ENABLED = False

[NSRL]
hash_list = /opt/multiscanner/etc/nsrl/hash_list
offsets = /opt/multiscanner/etc/nsrl/offsets
ENABLED = False

[Cuckoo]
ENABLED = False
API URL = http://cuckoo:8090/
WEB URL = http://cuckoo:80/
timeout = 360
running timeout = 120
delete tasks = False
maec = False

[FireeyeAPI]
API URL = https://fireeye/wsapis/v1.1.0
fireeye images = ['win7-sp1', 'win7x64-sp1', 'winxp-sp3']
username = api_analyst
password = Password123
info level = normal
timeout = 500
force = False
analysis type = 0
application id = 0
ENABLED = False

[FireeyeScan]
fireeye images = ['win7-sp1', 'win7x64-sp1', 'winxp-sp2', 'winxp-sp3']
ENABLED = False
good path = good
base path = /mnt/fireeyeshare/
bad path = bad
src path = src

[VxStream]
ENABLED = False
BASE URL = http://localhost
API URL = http://localhost/api/
API key =
API secret =
Environment ID = 1
Verify = False
timeout = 360
running timeout = 120

[ExifToolsScan]
cmdline = ['-t']
path = C:\exiftool.exe
key = /opt/multiscanner/etc/id_rsa
host = ('MultiScanner', 22, 'User')
replacement path = X:\
remove-entry = ['ExifTool Version Number', 'File Name', 'Directory', 'File Modification Date/Time', 'File Creation Date/Time', 'File Access Date/Time', 'File Permissions']
ENABLED = False

[PEFile]
ENABLED = True

[Tika]
ENABLED = False
remove-entry = ['X-TIKA:parse_time_millis']

[TrID]
path = /opt/trid/trid
ENABLED = False
key = /opt/multiscanner/etc/id_rsa
cmdline = ['-r:3']
host = ('MultiScanner', 22, 'User')
replacement path = X:\

[flarefloss]
ENABLED = False
path = /opt/floss
cmdline = ['--show-metainfo']

[libmagic]
magicfile = None
ENABLED = True

[pdfinfo]
ENABLED = True
fast = False

[pehasher]
ENABLED = True

[YaraScan]
ruledir = /opt/multiscanner/etc/yarasigs
fileextensions = ['.yar', '.yara', '.sig']
ignore-tags = ['TLPRED']
includes = False
ENABLED = True

[main]
copyfilesto = False
group-types = ['Antivirus']
storage-config = /opt/multiscanner/storage.ini
api-config = /opt/multiscanner/api_config.ini
web-config = /opt/multiscanner/web_config.ini

12 changes: 12 additions & 0 deletions docker_utils/storage.ini
@@ -0,0 +1,12 @@
[File]
ENABLED = False
path = report.json
gzip = False
files-per-line = 1

[ElasticSearchStorage]
ENABLED = True
host = elastic
port = 9200
index = multiscanner_reports
doc_type = report
8 changes: 8 additions & 0 deletions docker_utils/web_config.ini
@@ -0,0 +1,8 @@
[web]
HOST = 0.0.0.0
PORT = 8000
API_LOC = http://localhost:8080
DEBUG = False
METADATA_FIELDS = ['Submitter Name', 'Submission Description', 'Submitter Email', 'Submitter Organization', 'Submitter Phone']
TAGS = ['Malware', 'Benign']

47 changes: 47 additions & 0 deletions docs/analytics.md
@@ -0,0 +1,47 @@
# Analytics #
Enabling analytics and advanced queries is the primary advantage of running
several tools against a sample, extracting as much information as possible, and
storing the output in a common datastore.

The following are some example types of analytics and queries that may be of
interest:

- cluster samples
- outlier samples
- samples for deep-dive analysis
- gaps in current toolset
- machine learning analytics on tool outputs
- others

## ssdeep Comparison ##
Fuzzy hashing is an effective method to identify similar files based on common
byte strings despite changes in the byte order and structure of the files.
[ssdeep](https://ssdeep-project.github.io/ssdeep/index.html) provides a fuzzy
hash implementation and provides the capability to compare hashes.

Comparing ssdeep hashes at scale is a challenge; a method for doing so was
originally described in [[1]](https://www.virusbulletin.com/virusbulletin/2015/11/optimizing-ssdeep-use-scale/).

The ssdeep analytic computes ```ssdeep.compare``` across samples, stores the
non-zero results, and provides the capability to return all samples clustered
based on the ssdeep hash.

### Elasticsearch ###
When possible, it can be effective to push work to the Elasticsearch cluster,
which supports horizontal scaling. For the ssdeep comparison, Elasticsearch
[NGram Tokenizers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html)
are used to compute 7-grams of the chunk and double-chunk portions
of the ssdeep hash as described here [[2]](http://www.intezer.com/intezer-community-tip-ssdeep-comparisons-with-elasticsearch/).
This prevents ever comparing two ssdeep hashes where the result will be zero.
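An ssdeep hash has the form `blocksize:chunk:double_chunk`. As a rough, illustrative sketch of what the NGram tokenization amounts to (the actual index mapping is defined in the Elasticsearch setup, not in this diff; the function name here is invented for illustration):

```python
def ssdeep_ngrams(ssdeep_hash, n=7):
    """Split an ssdeep hash into its three parts and compute the
    n-grams of the chunk and double-chunk strings -- roughly what an
    Elasticsearch NGram tokenizer emits at index time."""
    blocksize, chunk, double_chunk = ssdeep_hash.split(":")
    grams = lambda s: [s[i:i + n] for i in range(len(s) - n + 1)]
    return {
        "blocksize": int(blocksize),
        "chunk_ngrams": grams(chunk),
        "double_chunk_ngrams": grams(double_chunk),
    }
```

The optimization described in [1] and [2] rests on the fact that a non-zero ```ssdeep.compare``` score requires the two hashes to share a common 7-character substring, so candidate pairs can safely be limited to hashes that share at least one 7-gram.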

### Python ###
Because we need to compute ```ssdeep.compare```, the ssdeep analytic cannot be
done entirely in Elasticsearch. Python is used to query Elasticsearch, compute
```ssdeep.compare``` on the results, and update the documents in Elasticsearch.
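The production analytic pulls hashes from Elasticsearch and calls ```ssdeep.compare```; the clustering half can be sketched on its own (a stub stands in for ```ssdeep.compare```, and the function names are illustrative, not taken from the multiscanner source):

```python
def cluster_samples(hashes, compare):
    """Group sample IDs whose pairwise compare score is non-zero,
    using a simple union-find. `compare` stands in for ssdeep.compare."""
    parent = {s: s for s in hashes}

    def find(x):
        # Walk to the root, compressing the path as we go.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    ids = list(hashes)
    for i, a in enumerate(ids):
        for b in ids[i + 1:]:
            if compare(hashes[a], hashes[b]) > 0:
                parent[find(a)] = find(b)

    clusters = {}
    for s in ids:
        clusters.setdefault(find(s), []).append(s)
    return list(clusters.values())
```

The quadratic pairwise loop is exactly what the Elasticsearch n-gram pre-filter described above cuts down.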

### Deployment ###
[celery beat](http://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html)
is used to schedule and kick off the ssdeep comparison task nightly at 2am
local time, when the system is under less load from users. This ensures
that the analytic runs on all samples without adding an exorbitant load
to the system.
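A beat schedule for this looks roughly like the following (the task name and app name are assumptions for illustration; the real schedule lives in the deployment configuration, not in this diff):

```python
from celery import Celery
from celery.schedules import crontab

app = Celery("multiscanner")

app.conf.beat_schedule = {
    "ssdeep-compare-nightly": {
        "task": "tasks.ssdeep_compare",         # hypothetical task name
        "schedule": crontab(hour=2, minute=0),  # nightly at 2am local time
    },
}
```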
Binary file added docs/distributed_ms_diagram.PNG
65 changes: 65 additions & 0 deletions docs/distributed_multiscanner.md
@@ -0,0 +1,65 @@
# Distributed MultiScanner #
MultiScanner is a file analysis framework that assists the user in evaluating a set of files by automatically running a suite of tools for the user and aggregating the output. Tools can be custom built python scripts, web APIs, software running on another machine, etc. Tools are incorporated by creating modules that run in the MultiScanner framework.

Modules are designed to be quickly written and easily incorporated into the framework. Currently written and maintained modules are related to malware analytics, but the framework is not limited to that scope. For a list of modules you can look in [modules](../modules), descriptions and config options can be found in [modules.md](modules.md).

MultiScanner also supports a distributed workflow for sample storage, analysis, and report viewing. This functionality includes a web interface, a REST API, a distributed file system (GlusterFS), distributed report storage / searching (ElasticSearch), and distributed task management (Celery / RabbitMQ).

## Intended Use case ##
Distributed MultiScanner is intended to solve any combination of these problems / use cases:

* Malware repository (i.e., long-term storage of binaries and metadata)
* Scalable analysis capabilities
* Every component of the Distributed MultiScanner is designed with scale in mind
* Note this does not include the following:
* The scaling of external malware analysis tools such as Cuckoo
* Does not perform auto-scaling (e.g. auto-provisioning of VMs, etc.)
* New nodes must be deployed manually and added to the Ansible playbook to receive the proper configurations
* Enable analytics on malware samples
* Either by interacting with the ElasticSearch backend or plugging into the web / REST UI
* Cyber Threat Intelligence (CTI) integration / storage
* Export CTI
* Intend to output reports in multiple formats: STIX, MAEC, PDF, HTML, and JSON
* Currently support JSON, MAEC 5.0, and HTML
* Enables sharing of malware analysis results
* Supported file submission types:
* Currently support all file formats (e.g. PE, PDF, Office, etc…)
* Currently doesn’t support extraction of files from PCAP / memory dumps / other data streams (but this is in the dev plan)
* Intended users:
* Security Operations Centers (SOCs)
* Malware analysis centers
* CTI sharing organizations

## Architecture ##
This is the current architecture:

![alt text](https://raw.githubusercontent.com/awest1339/multiscanner/celery/docs/distributed_ms_diagram.PNG)

When a sample is submitted (either via the web UI or the REST API), the sample is saved to the distributed file system (GlusterFS), a task is added to the distributed task queue (Celery), and an entry is added to the task management database (PostgreSQL). The worker nodes (Celery clients) all have the GlusterFS mounted, which gives them access to the samples for scanning. In our setup, we colocate the worker nodes with the GlusterFS nodes in order to reduce the network load of workers pulling samples from GlusterFS. When a new task is added to the Celery task queue, one of the worker nodes will pull the task and retrieve the corresponding sample from the GlusterFS via its SHA256 value. The worker node then performs the scanning work. Modules can be enabled / disabled via a configuration file. This configuration file is distributed to the workers by Ansible at setup time (details on this process later). When the worker finishes its scans, it will generate a JSON blob and index it into ElasticSearch for permanent storage. It will then update the task management database with a status of "Complete". The user will then be able to view the report via the web interface or retrieve the raw JSON.
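The store-then-fetch-by-hash step amounts to content-addressed storage, which can be sketched as follows (the mount path and flat directory layout are assumptions for illustration; multiscanner's actual GlusterFS layout is not shown in this diff):

```python
import hashlib
from pathlib import Path

def store_sample(data, root="/mnt/gluster/samples"):
    """Write a sample under its SHA256 digest so any worker with the
    share mounted can retrieve it by hash alone."""
    digest = hashlib.sha256(data).hexdigest()
    root = Path(root)
    root.mkdir(parents=True, exist_ok=True)
    (root / digest).write_bytes(data)
    return digest

def fetch_sample(digest, root="/mnt/gluster/samples"):
    """Retrieve a previously stored sample by its SHA256 digest."""
    return (Path(root) / digest).read_bytes()
```

Because the path is derived from the content, the digest recorded in the task database is all a worker needs to locate the sample on the share.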

## Setup ##
Currently, we deploy this system with Ansible. More information about that process can be found [here](https://github.com/mitre/multiscanner-ansible). We are also currently working to support deploying the distributed architecture via Docker. If you wish to get an idea of how the system works without having to go through the full process of setting up the distributed architecture, look into our docker containers for a standalone [system](docker_standalone.md). Obviously, the standalone system will be far less scalable / robust / feature-rich. However, it will stand up the web UI, the REST API, and an ElasticSearch node for you to see how the system works. The standalone container is intended as an introduction to the system and its capabilities, but not designed for use in production.

## Architecture Details ##
What follows is a brief discussion of the tools and design choices we made in the creation of this system.

### Web Frontend ###
The web application runs on [Flask](http://flask.pocoo.org/), uses [Bootstrap](https://getbootstrap.com/) and [jQuery](https://jquery.com/), and is served via Apache. It is essentially an aesthetic wrapper around the REST API; all data and services provided are also available by querying the REST API.

### REST API ###
The REST API is also powered by Flask and served via Apache. It has an underlying PostgreSQL database in order to facilitate task tracking. Additionally, it acts as a gateway to the backend ElasticSearch document store. Searches entered into the web UI will be routed through the REST API and passed to the ElasticSearch cluster. This abstracts the complexity of querying ElasticSearch and gives the user a simple web interface to work with.

### Task Queue ###
We use Celery as our distributed task queue.

### Task Tracking ###
PostgreSQL is our task management database. It is here that we keep track of scan times, samples, and the status of tasks (pending, complete, failed).

### Distributed File System ###
GlusterFS is our distributed file system. Each component that needs access to the raw samples mounts the share via FUSE. We selected GlusterFS because it is much more performant in our use case of storing a large number of small samples than a technology like HDFS would be.

### Worker Nodes ###
The worker nodes are simply Celery clients running the MultiScanner Python application. Additionally, we implemented some batching within Celery to improve the performance of our worker nodes (which operate better at scale). Worker nodes will wait until there are 100 samples in their queue or 60 seconds have passed (whichever happens first) before kicking off their scans. These figures are configurable.
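The batching behaviour described above can be sketched as follows (a simplification; the real logic is implemented inside the Celery workers and driven by settings like `flush_every` / `flush_interval` in the `[celery]` config section):

```python
import time

def next_batch(tasks, batch_size=100, max_wait=60.0, clock=time.monotonic):
    """Collect tasks until `batch_size` items have been gathered or
    `max_wait` seconds have elapsed, whichever comes first."""
    batch = []
    deadline = clock() + max_wait
    for task in tasks:
        batch.append(task)
        if len(batch) >= batch_size or clock() >= deadline:
            break
    return batch
```

Batching amortizes per-scan startup cost (module loading, remote connections) across many samples, which is why the workers "operate better at scale."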

### Report Storage ###
We use ElasticSearch to store the results of our file scans. This is where the true power of this system comes in. ElasticSearch allows for performant, full text searching across all our reports and modules. This allows fast access to interesting details from your malware analysis tools, pivoting between samples, and powerful analytics on report output.