diff --git a/analytics/ssdeep_analytics.py b/analytics/ssdeep_analytics.py
new file mode 100644
index 00000000..29792973
--- /dev/null
+++ b/analytics/ssdeep_analytics.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python
+
+'''
+Set of analytics based on ssdeep hash.
+
+- compare
+    Simple implementation of ssdeep comparisons using a few optimizations
+    described at the links below
+
+    https://www.virusbulletin.com/virusbulletin/2015/11/optimizing-ssdeep-use-scale
+    http://www.intezer.com/intezer-community-tip-ssdeep-comparisons-with-elasticsearch/
+
+    Designed to be run on a regular basis (e.g., nightly).
+
+    For each sample on which the ssdeep analytic has not yet run, search for
+    samples where ssdeep.compare > 0 based on chunksize, chunk 7-grams, and
+    double-chunk 7-grams. Update the sample with any matches and mark the
+    ssdeep analytic as having run.
+
+- group
+    Returns SHA256 hashes of samples grouped based on ssdeep hash.
+'''
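+
+# Note (illustrative): an ssdeep hash has the form chunksize:chunk:double_chunk,
+# e.g. '3:AXGBicFlgVNhBGcL6wCrFQEv:AXGHsNhxLsr2C' splits into chunksize 3,
+# chunk 'AXGBicFlgVNhBGcL6wCrFQEv', and double_chunk 'AXGHsNhxLsr2C'. The ES
+# storage module is assumed to index these pieces separately so the queries
+# below can prefilter candidates before calling ssdeep.compare().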
+
+import sys
+import os
+import argparse
+import requests
+import json
+import ssdeep
+import configparser
+from pprint import pprint
+
+MS_WD = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+if os.path.join(MS_WD, 'storage') not in sys.path:
+    sys.path.insert(0, os.path.join(MS_WD, 'storage'))
+if MS_WD not in sys.path:
+    sys.path.insert(0, os.path.join(MS_WD))
+
+import multiscanner
+import common
+import elasticsearch_storage
+
+
+class SSDeepAnalytic:
+
+    def __init__(self, debug=False):
+        storage_conf = multiscanner.common.get_config_path(multiscanner.CONFIG, 'storage')
+        config_object = configparser.SafeConfigParser()
+        config_object.optionxform = str
+        config_object.read(storage_conf)
+        conf = common.parse_config(config_object)
+        storage_handler = multiscanner.storage.StorageHandler(configfile=storage_conf)
+        es_handler = None
+        for handler in storage_handler.loaded_storage:
+            if isinstance(handler, elasticsearch_storage.ElasticSearchStorage):
+                es_handler = handler
+                break
+
+        if not es_handler:
+            print('[!] ERROR: This analytic only works with the ES storage module.')
+            sys.exit(0)
+
+        # probably not ideal...
+        self.es = es_handler.es
+        self.index = conf['ElasticSearchStorage']['index']
+        self.doc_type = 'sample'
+
+        self.debug = debug
+
+    def ssdeep_compare(self):
+        # get all of the samples where ssdeep_compare has not been run
+        # e.g., ssdeep.analyzed == false
+        query = {
+            '_source': ['ssdeep', 'SHA256'],
+            'query': {
+                'bool': {
+                    'must': [
+                        {'match': {'ssdeep.analyzed': 'false'}}
+                    ]
+                }
+            }
+        }
+
+        page = self.es.search(
+            self.index,
+            scroll='2m',
+            size=1000,
+            body=query)
+
+        records_list = []
+        while len(page['hits']['hits']) > 0:
+            for hit in page['hits']['hits']:
+                records_list.append(hit)
+            sid = page['_scroll_id']
+            page = self.es.scroll(scroll_id=sid, scroll='2m')
+
+        for new_ssdeep_hit in records_list:
+            new_ssdeep_hit_src = new_ssdeep_hit.get('_source')
+            chunksize = new_ssdeep_hit_src.get('ssdeep').get('chunksize')
+            chunk = new_ssdeep_hit_src.get('ssdeep').get('chunk')
+            double_chunk = new_ssdeep_hit_src.get('ssdeep').get('double_chunk')
+            new_sha256 = new_ssdeep_hit_src.get('SHA256')
+
+            # build new query for docs that match our optimizations
+            # https://github.com/intezer/ssdeep-elastic/blob/master/ssdeep_elastic/ssdeep_querying.py#L35
+            opti_query = {
+                '_source': ['ssdeep', 'SHA256'],
+                'query': {
+                    'bool': {
+                        'must': [
+                            {
+                                'terms': {
+                                    'ssdeep.chunksize': [chunksize, chunksize // 2, chunksize * 2]
+                                }
+                            },
+                            {
+                                'bool': {
+                                    'should': [
+                                        {
+                                            'match': {
+                                                'ssdeep.chunk': {
+                                                    'query': chunk
+                                                }
+                                            }
+                                        },
+                                        {
+                                            'match': {
+                                                'ssdeep.double_chunk': {
+                                                    'query': double_chunk
+                                                }
+                                            }
+                                        }
+                                    ],
+                                    'minimum_should_match': 1
+                                }
+                            },
+                            {
+                                'bool': {
+                                    'must_not': {
+                                        'match': {
+                                            'SHA256': new_sha256
+                                        }
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                }
+            }
+
+            # this bool condition isn't working how I expect
+            # if we have already updated the match dictionary to
+            # include a hit, don't rerun it for the inverse
+            # {
+            #     'bool': {
+            #         'must_not': {
+            #             'exists': {
+            #                 'field': 'ssdeep.matches.' + new_sha256
+            #             }
+            #         }
+            #     }
+            # }
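+
+            # Illustrative: for a chunksize of 1536 the terms filter above
+            # matches docs with chunksize 768, 1536, or 3072 only, since
+            # ssdeep.compare() can only yield a nonzero score for hashes
+            # whose chunksizes are equal or differ by a factor of two.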
+
+            opti_page = self.es.search(
+                self.index,
+                scroll='2m',
+                size=1000,
+                body=opti_query)
+
+            while len(opti_page['hits']['hits']) > 0:
+                # for each hit, ssdeep.compare != 0; update the matches
+                for opti_hit in opti_page['hits']['hits']:
+                    opti_hit_src = opti_hit.get('_source')
+                    opti_sha256 = opti_hit_src.get('SHA256')
+                    result = ssdeep.compare(
+                        new_ssdeep_hit_src.get('ssdeep').get('ssdeep_hash'),
+                        opti_hit_src.get('ssdeep').get('ssdeep_hash'))
+
+                    if self.debug:
+                        print(
+                            new_ssdeep_hit_src.get('SHA256'),
+                            opti_hit_src.get('SHA256'),
+                            result)
+
+                    msg = {'doc': {'ssdeep': {'matches': {opti_sha256: result}}}}
+                    self.es.update(
+                        index=self.index,
+                        doc_type=self.doc_type,
+                        id=new_ssdeep_hit.get('_id'),
+                        body=json.dumps(msg))
+
+                    msg = {'doc': {'ssdeep': {'matches': {new_sha256: result}}}}
+                    self.es.update(
+                        index=self.index,
+                        doc_type=self.doc_type,
+                        id=opti_hit.get('_id'),
+                        body=json.dumps(msg))
+
+                opti_sid = opti_page['_scroll_id']
+                opti_page = self.es.scroll(scroll_id=opti_sid, scroll='2m')
+
+            # analytic has run against sample, set ssdeep.analyzed = true
+            msg = {'doc': {'ssdeep': {'analyzed': 'true'}}}
+            self.es.update(
+                index=self.index,
+                doc_type=self.doc_type,
+                id=new_ssdeep_hit.get('_id'),
+                body=json.dumps(msg))
+
+    def ssdeep_group(self):
+        # get all of the samples that have ssdeep matches recorded
+        query = {
+            '_source': ['ssdeep', 'SHA256'],
+            'query': {
+                'exists': {
+                    'field': 'ssdeep.matches'
+                }
+            }
+        }
+
+        page = self.es.search(
+            self.index,
+            scroll='2m',
+            size=1000,
+            body=query)
+
+        records = {}
+        while len(page['hits']['hits']) > 0:
+            for hit in page['hits']['hits']:
+                hit_src = hit.get('_source')
+                records[hit_src.get('SHA256')] = hit_src.get('ssdeep', {}) \
+                    .get('matches', {})
+            sid = page['_scroll_id']
+            page = self.es.scroll(scroll_id=sid, scroll='2m')
+
+        # inspired by ssdc
+        groups = []
+        for sha256_, matches_dict in records.items():
+            in_group = False
+            for i in range(len(groups)):
+                if sha256_ in groups[i]:
+                    in_group = True
+                    continue
+                should_add = True
+                for match_hash in groups[i]:
+                    if match_hash not in records.get(sha256_):
+                        should_add = False
+                if should_add:
+                    groups[i].append(sha256_)
+                    in_group = True
+            if not in_group:
+                groups.append([sha256_])
+
+        return groups
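+
+# Worked example for ssdeep_group() (illustrative): given match records
+#   {'A': {'B': 57}, 'B': {'A': 57, 'C': 0}, 'C': {'B': 0}}
+# the loop yields [['A', 'B'], ['C']] -- 'B' joins A's group because every
+# existing member appears in B's matches, while 'C' starts a new group
+# because 'A' is not among C's matches.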
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Script to interact with '
+        'Multiscanner\'s Elasticsearch datastore to run analytics based on '
+        'ssdeep hash.')
+    group = parser.add_mutually_exclusive_group(required=True)
+    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
+        help='Increase output to stdout')
+    group.add_argument('-c', '--compare', dest='compare', action='store_true',
+        help='Run ssdeep.compare using a few optimizations based on ssdeep'
+        ' hash structure.')
+    group.add_argument('-g', '--group', dest='group', action='store_true',
+        help='Returns group of samples based on ssdeep hash.')
+
+    args = parser.parse_args()
+
+    ssdeep_analytic = SSDeepAnalytic(debug=args.verbose)
+
+    if args.compare:
+        ssdeep_analytic.ssdeep_compare()
+        print('[*] Success')
+    elif args.group:
+        pprint(ssdeep_analytic.ssdeep_group())
+        print('[*] Success')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/analytics/ssdeep_compare.py b/analytics/ssdeep_compare.py
deleted file mode 100644
index 75630e73..00000000
--- a/analytics/ssdeep_compare.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/env python
-
-'''
-Simple implementation of ssdeep comparisions using a few optimizations
-described at the links below
-
-https://www.virusbulletin.com/virusbulletin/2015/11/optimizing-ssdeep-use-scale
-http://www.intezer.com/intezer-community-tip-ssdeep-comparisons-with-elasticsearch/
-
-Designed to be run on a regular basis (e.g., nightly).
-
-For each sample that has not run ssdeep analytic, search for samples where
-ssdeep.compare > 0 based on chunksize, chunk 7grams, and double-chunk 7grams.
-Update sample with any matches and mark ssdeep analytic as having run
-'''
-
-import sys
-import os
-import argparse
-import requests
-import json
-import ssdeep
-import configparser
-
-MS_WD = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if os.path.join(MS_WD, 'storage') not in sys.path:
-    sys.path.insert(0, os.path.join(MS_WD, 'storage'))
-if MS_WD not in sys.path:
-    sys.path.insert(0, os.path.join(MS_WD))
-
-import multiscanner
-import common
-import elasticsearch_storage
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Script to interact with Multiscanner\'s '
-        'Elasticsearch datastore to run ssdeep.compare using a few '
-        'optimizations based on ssdeep hash structure.')
-    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
-        help='Increase output to stdout')
-    args = parser.parse_args()
-
-    storage_conf = multiscanner.common.get_config_path(multiscanner.CONFIG, 'storage')
-    config_object = configparser.SafeConfigParser()
-    config_object.optionxform = str
-    config_object.read(storage_conf)
-    conf = common.parse_config(config_object)
-    storage_handler = multiscanner.storage.StorageHandler(configfile=storage_conf)
-    es_handler = None
-    for handler in storage_handler.loaded_storage:
-        if isinstance(handler, elasticsearch_storage.ElasticSearchStorage):
-            es_handler = handler
-            break
-
-    if not es_handler:
-        print('[!] ERROR: This analytic only works with ES stroage module.')
-        sys.exit(0)
-
-    # probably not ideal...
-    ES = es_handler.es
-    INDEX = conf['ElasticSearchStorage']['index']
-    DOC_TYPE = 'sample'
-
-    # get all of the samples where ssdeep_compare has not been run
-    # e.g., ssdeepmeta.analyzed == false
-    query = {
-        '_source': ['ssdeep', 'SHA256'],
-        'query': {
-            'bool': {
-                'must': [
-                    { 'match': { 'ssdeep.analyzed': 'false' }}
-                ]
-            }
-        }
-    }
-
-    page = ES.search(
-        INDEX,
-        scroll='2m',
-        size=1000,
-        body=query)
-
-    records_list = []
-    while len(page['hits']['hits']) > 0:
-        for hit in page['hits']['hits']:
-            records_list.append(hit)
-        sid = page['_scroll_id']
-        page = ES.scroll(scroll_id=sid, scroll='2m')
-
-    for new_ssdeep_hit in records_list:
-        new_ssdeep_hit_src = new_ssdeep_hit.get('_source')
-        chunksize = new_ssdeep_hit_src.get('ssdeep').get('chunksize')
-        chunk = new_ssdeep_hit_src.get('ssdeep').get('chunk')
-        double_chunk = new_ssdeep_hit_src.get('ssdeep').get('double_chunk')
-        new_sha256 = new_ssdeep_hit_src.get('SHA256')
-
-        # build new query for docs that match our optimizations
-        # https://github.com/intezer/ssdeep-elastic/blob/master/ssdeep_elastic/ssdeep_querying.py#L35
-        opti_query = {
-            '_source': ['ssdeep', 'SHA256'],
-            'query': {
-                'bool': {
-                    'must': [
-                        {
-                            'terms': {
-                                'ssdeep.chunksize': [chunksize, chunksize / 2, chunksize * 2]
-                            }
-                        },
-                        {
-                            'bool': {
-                                'should': [
-                                    {
-                                        'match': {
-                                            'ssdeep.chunk': {
-                                                'query': chunk
-                                            }
-                                        }
-                                    },
-                                    {
-                                        'match': {
-                                            'ssdeep.double_chunk': {
-                                                'query': double_chunk
-                                            }
-                                        }
-                                    }
-                                ],
-                                'minimum_should_match': 1
-                            }
-                        },
-                        {
-                            'bool': {
-                                'must_not': {
-                                    'match': {
-                                        'SHA256': new_sha256
-                                    }
-                                }
-                            }
-                        }
-                    ]
-                }
-            }
-        }
-
-        # this bool condition isn't working how I expect
-        # if we have already updated the match dictionary to
-        # include a hit, don't rerun it for the inverse
-        # {
-        #     'bool': {
-        #         'must_not': {
-        #             'exists': {
-        #                 'field': 'ssdeep.matches.' + new_sha256
-        #             }
-        #         }
-        #     }
-        # }
-
-        opti_page = ES.search(
-            INDEX,
-            scroll='2m',
-            size=1000,
-            body=opti_query)
-
-        while len(opti_page['hits']['hits']) > 0:
-            # for each hit, ssdeep.compare != 0; update the matches
-            for opti_hit in opti_page['hits']['hits']:
-                opti_hit_src = opti_hit.get('_source')
-                opti_sha256 = opti_hit_src.get('SHA256')
-                result = ssdeep.compare(
-                    new_ssdeep_hit_src.get('ssdeep').get('ssdeep_hash'),
-                    opti_hit_src.get('ssdeep').get('ssdeep_hash'))
-
-                if args.verbose:
-                    print(
-                        new_ssdeep_hit_src.get('SHA256'),
-                        opti_hit_src.get('SHA256'),
-                        result)
-
-                msg = { 'doc': { 'ssdeep': { 'matches': { opti_sha256: result } } } }
-                ES.update(
-                    index=INDEX,
-                    doc_type=DOC_TYPE,
-                    id=new_ssdeep_hit.get('_id'),
-                    body=json.dumps(msg))
-
-                msg = { 'doc': { 'ssdeep': { 'matches': { new_sha256: result } } } }
-                ES.update(
-                    index=INDEX,
-                    doc_type=DOC_TYPE,
-                    id=opti_hit.get('_id'),
-                    body=json.dumps(msg))
-
-            opti_sid = opti_page['_scroll_id']
-            opti_page = ES.scroll(scroll_id=opti_sid, scroll='2m')
-
-        # analytic has run against sample, set ssdeep.analyzed = true
-        msg = { 'doc': { 'ssdeep': { 'analyzed': 'true'} } }
-        ES.update(
-            index=INDEX,
-            doc_type=DOC_TYPE,
-            id=new_ssdeep_hit.get('_id'),
-            body=json.dumps(msg))
-
-if __name__ == '__main__':
-    main()
diff --git a/modules/Detonation/Cuckoo.py b/modules/Detonation/Cuckoo.py
index 9d647ef7..942cdf11 100644
--- a/modules/Detonation/Cuckoo.py
+++ b/modules/Detonation/Cuckoo.py
@@ -47,7 +47,7 @@ def scan(filelist, conf=DEFAULTCONF):
     report_url = api_url + 'tasks/report/'
     view_url = api_url + 'tasks/view/'
     delete_url = api_url + 'tasks/delete/'
-    maec_report_url = api_url + 'tasks/report/{task_id}/maec'
+    maec_report_url = '<a href="' + api_url + 'tasks/report/{task_id}/maec" target="_blank">View the Cuckoo MAEC report</a>'
     web_report_url = '<a href="{web_url}/analysis/{task_id}/summary" target="_blank">View the report in Cuckoo</a>'

     for fname in filelist:
@@ -75,9 +75,10 @@ def scan(filelist, conf=DEFAULTCONF):
         if status == 'reported':
             report = fetch_report_json(report_url+task_id)
             if conf['maec']:
-                maec_report = fetch_report_json(
-                    maec_report_url.format(task_id=task_id))
-                report['maec'] = maec_report
+                report['info']['maec report'] = maec_report_url.format(task_id=task_id)
+                # maec_report = fetch_report_json(
+                #     maec_report_url.format(task_id=task_id))
+                # report['maec'] = maec_report
                 # TODO - should we just modify Cuckoo to add this itself?
             if report.get('info'):
                 report['info']['web_report'] = web_report_url.format(
diff --git a/utils/api.py b/utils/api.py
index 86405641..c6a07a3e 100755
--- a/utils/api.py
+++ b/utils/api.py
@@ -18,6 +18,7 @@
 GET /api/v1/tasks/search/ ---> receive list of most recent report for matching samples
 GET /api/v1/tasks/search/history ---> receive list of all reports for matching samples
 GET /api/v1/tasks/<task_id>/file?raw={t|f} ----> download sample, defaults to passwd protected zip
+GET /api/v1/tasks/<task_id>/maec ----> download the Cuckoo MAEC 5.0 report, if it exists
 GET /api/v1/tasks/<task_id>/notes ---> Receive list of this task's notes
 POST /api/v1/tasks/<task_id>/notes ---> Add a note to task
 PUT /api/v1/tasks/<task_id>/notes/<note_id> ---> Edit a note
@@ -25,6 +26,8 @@
 GET /api/v1/tasks/<task_id>/report?d={t|f} ---> receive report in JSON, set d=t to download
 POST /api/v1/tasks/<task_id>/tags ---> Add tags to task
 DELETE /api/v1/tasks/<task_id>/tags ---> Remove tags from task
+GET /api/v1/analytics/ssdeep_compare ---> Run ssdeep.compare analytic
+GET /api/v1/analytics/ssdeep_group ---> Receive list of sample hashes grouped by ssdeep hash

 The API endpoints all have Cross Origin Resource Sharing (CORS) enabled. By
 default it will allow requests from any port on localhost. Change this setting
@@ -53,16 +56,22 @@ from six import PY3
 import rarfile
 import zipfile
+import requests

 MS_WD = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if os.path.join(MS_WD, 'storage') not in sys.path:
     sys.path.insert(0, os.path.join(MS_WD, 'storage'))
+if os.path.join(MS_WD, 'analytics') not in sys.path:
+    sys.path.insert(0, os.path.join(MS_WD, 'analytics'))
+if os.path.join(MS_WD, 'libs') not in sys.path:
+    sys.path.insert(0, os.path.join(MS_WD, 'libs'))
 if MS_WD not in sys.path:
     sys.path.insert(0, os.path.join(MS_WD))

 import multiscanner
 import sql_driver as database
 import elasticsearch_storage
+import common

 TASK_NOT_FOUND = {'Message': 'No task or report with that ID found!'}
 INVALID_REQUEST = {'Message': 'Invalid request parameters'}
@@ -114,6 +123,7 @@ def default(self, obj):

 # Needs api_config in order to function properly
 from celery_worker import multiscanner_celery
+from ssdeep_analytics import SSDeepAnalytic

 db = database.Database(config=api_config.get('Database'))
 # To run under Apache, we need to set up the DB outside of __main__
@@ -125,6 +135,12 @@ def default(self, obj):
     if isinstance(handler, elasticsearch_storage.ElasticSearchStorage):
         break

+ms_config_object = configparser.SafeConfigParser()
+ms_config_object.optionxform = str
+ms_configfile = multiscanner.CONFIG
+ms_config_object.read(ms_configfile)
+ms_config = common.parse_config(ms_config_object)
+
 try:
     DISTRIBUTED = api_config['api']['distributed']
 except KeyError:
@@ -259,9 +275,17 @@ def task_list():

 def search(params, get_all=False):
     # Pass search term to Elasticsearch, get back list of sample_ids
-    search_term = params['search[value]']
+    sample_id = params.get('sha256')
+    if sample_id:
+        task_id = db.exists(sample_id)
+        if task_id:
+            return {'TaskID': task_id}
+        else:
+            return TASK_NOT_FOUND
+
+    search_term = params.get('search[value]')
     search_type = params.pop('search_type', 'default')
-    if search_term == '':
+    if not search_term:
         es_result = None
     else:
         es_result = handler.search(search_term, search_type)
@@ -559,21 +583,23 @@
             .get('matches', {})

         if matches_dict:
+            links_dict = {}
             # k=SHA256, v=ssdeep.compare result
             for k, v in matches_dict.items():
                 t_id = db.exists(k)
                 if t_id:
-                    matches_dict.pop(k)
                     url = '{h}/report/{t_id}'.format(
                         h=web_loc, t_id=t_id)
                     href = '<a href="{url}">{sha256}</a>'.format(
                         url=url, sha256=k)
-                    matches_dict[href] = v
+                    links_dict[href] = v
+                else:
+                    links_dict[k] = v

             # replace with updated dict
-            report_dict['Report']['ssdeep']['matches'] = matches_dict
+            report_dict['Report']['ssdeep']['matches'] = links_dict

     return report_dict
@@ -594,6 +620,32 @@ def files_get_task(task_id):
         return jsonify({'Error': 'sha256 not in report!'})


+@app.route('/api/v1/tasks/<task_id>/maec', methods=['GET'])
+def get_maec_report(task_id):
+    # try to get report dict
+    report_dict, success = get_report_dict(task_id)
+    if not success:
+        return jsonify(report_dict)
+
+    # okay, we have the report dict; get the Cuckoo task ID
+    try:
+        cuckoo_task_id = report_dict['Report']['Cuckoo Sandbox']['info']['id']
+    except KeyError:
+        return jsonify({'Error': 'No MAEC report found for that task!'})
+
+    # Get the MAEC report from Cuckoo
+    try:
+        maec_report = requests.get(
+            '{}/v1/tasks/report/{}/maec'.format(ms_config.get('Cuckoo', {}).get('API URL', ''), cuckoo_task_id)
+        )
+    except Exception:
+        return jsonify({'Error': 'No MAEC report found for that task!'})
+
+    # raw JSON
+    response = make_response(jsonify(maec_report.json()))
+    response.headers['Content-Type'] = 'application/json'
+    response.headers['Content-Disposition'] = 'attachment; filename=%s.json' % task_id
+    return response
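+
+# Illustrative request for the endpoint above (host and port are assumptions):
+#   curl -o 1_maec.json http://localhost:8080/api/v1/tasks/1/maec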
+
+
 def get_report_dict(task_id):
     task = db.get_task(task_id)
     if not task:
@@ -781,6 +833,33 @@ def files_get_sha256_helper(sha256, raw=None):
         response.headers['Content-Disposition'] = 'inline; filename={}.zip'.format(sha256)
     return response
+
+
+@app.route('/api/v1/analytics/ssdeep_compare', methods=['GET'])
+def run_ssdeep_compare():
+    '''
+    Runs the ssdeep compare analytic and returns a success / error message.
+    '''
+    try:
+        ssdeep_analytic = SSDeepAnalytic()
+        ssdeep_analytic.ssdeep_compare()
+        return make_response(jsonify({'Message': 'Success'}))
+    except Exception:
+        return make_response(
+            jsonify({'Message': 'Unable to complete request.'}),
+            HTTP_BAD_REQUEST)
+
+
+@app.route('/api/v1/analytics/ssdeep_group', methods=['GET'])
+def run_ssdeep_group():
+    '''
+    Runs the ssdeep group analytic and returns the list of groups.
+    '''
+    try:
+        ssdeep_analytic = SSDeepAnalytic()
+        groups = ssdeep_analytic.ssdeep_group()
+        return make_response(jsonify({'groups': groups}))
+    except Exception:
+        return make_response(
+            jsonify({'Message': 'Unable to complete request.'}),
+            HTTP_BAD_REQUEST)
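+
+
+# Illustrative usage of the analytics endpoints above (host and port are
+# assumptions):
+#   curl http://localhost:8080/api/v1/analytics/ssdeep_compare
+#   curl http://localhost:8080/api/v1/analytics/ssdeep_group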


 if __name__ == '__main__':
diff --git a/web/app.py b/web/app.py
index 8d8bbbd2..e658701f 100644
--- a/web/app.py
+++ b/web/app.py
@@ -84,6 +84,11 @@ def history():
     return render_template('history.html', api_loc=app.config['API_LOC'])


+@app.route('/analytics', methods=['GET'])
+def analytics():
+    return render_template('analytics.html', api_loc=app.config['API_LOC'])
+
+
 if __name__ == "__main__":
     app.run(debug=app.config['DEBUG'],
             port=app.config['PORT'],
diff --git a/web/static/css/styles.css b/web/static/css/styles.css
index 012d29d7..416d30cb 100644
--- a/web/static/css/styles.css
+++ b/web/static/css/styles.css
@@ -507,6 +507,7 @@
     width: 50px;
     overflow: hidden;
     word-wrap: break-word;
+    font-family: 'Inconsolata', 'Consolas', monospace;
 }

 table.dataTable thead .sorting:after {
diff --git a/web/templates/analytics.html b/web/templates/analytics.html
new file mode 100644
index 00000000..6ba60b4d
--- /dev/null
+++ b/web/templates/analytics.html
@@ -0,0 +1,74 @@
+{% extends "layout.html" %}
+
+{% block head %}
+
+{% endblock %}
+
+{% block title %}Analytics{% endblock %}
+
+{% block content %}
+<div class="container">
+  <div class="panel panel-default">
+    <div class="panel-heading">SSDeep Groups</div>
+    <div class="panel-body">
+      <table class="table">
+        <thead>
+          <tr>
+            <th>Group</th>
+            <th>SHA256</th>
+          </tr>
+        </thead>
+        <tbody>
+        </tbody>
+      </table>
+    </div>
+  </div>
+</div>
+{% endblock %}
diff --git a/web/templates/layout.html b/web/templates/layout.html
index cd465207..c95eb02f 100644
--- a/web/templates/layout.html
+++ b/web/templates/layout.html
@@ -44,6 +44,7 @@
   • <a href="/">Scan</a>
   • <a href="/analyses">Analyses</a>
   • <a href="/history">History</a>
+  • <a href="/analytics">Analytics</a>