Source code for androlyze.storage.resultdb.ResultDatabaseStorage


# encoding: utf-8

__author__ = "Nils Tobias Schmidt"
__email__ = "schmidt89 at informatik.uni-marburg.de"

from collections import OrderedDict
import os
import ssl
import sys

from androlyze.analyze import AnalyzeUtil
from androlyze.log.Log import log
from androlyze.model.analysis.result.StaticResultKeys import *
from androlyze.model.android.apk.FastApk import FastApk
from androlyze.storage.apk.ApkCopyInterface import ApkCopyInterface
from androlyze.storage.exception import DatabaseOpenError, \
    DatabaseDeleteException, DatabaseStoreException, DatabaseLoadException
from androlyze.storage.resultdb import MongoUtil
from androlyze.storage.resultdb.MongoUtil import escape_keys, \
    MONGODB_IN_OPERATOR
from androlyze.storage.resultdb.ResultsStorageInterface import ResultStorageInterface
from bson.errors import BSONError
import gridfs
from gridfs.errors import NoFile
import pymongo
from pymongo.errors import PyMongoError, ConnectionFailure

# collection name for normal documents
RESULT_DOCUMENTS_COLLECTION_NAME = "docs"

# gridfs collections prefix
GRIDFS_COLLS_PREFIX = "fs"

# gridfs chunks collection
GRIDFS_CHUNKS = 'chunks'

# gridfs files metadata key
GRIDFS_FILES_METADATA = 'metadata'
# gridfs files filename key
GRIDFS_FILES_FILENAME = 'filename'

# gridfs files collection
GRIDFS_FILES = "files"

# gridfs collections
FILES_COLL_NAME = '%s.%s' % (GRIDFS_COLLS_PREFIX, GRIDFS_FILES)
CHUNKS_COLL_NAME = '%s.%s' % (GRIDFS_COLLS_PREFIX, GRIDFS_CHUNKS)

# apk database
APK_DB_NAME = 'apks'

# tuple of exceptions that indiciate connection errors
CONNECTION_FAIL_ERRORS = (ConnectionFailure, )

MAX_BSON_SIZE = 16770000

[docs]class ResultDatabaseStorage(object, ResultStorageInterface, ApkCopyInterface): ''' Class for storing documents and/or binary data in mongodb. ''' def __init__(self, db_name = None, dest_addr = None, dest_port = None, # auth username = None, passwd = None, # ssl use_ssl = False, ssl_ca_certs = None, ): ''' Create (if not existing) and open the database and collections. Parameters ---------- db_name : str, optional (default is "res") The name of the database to use. Will be created if not already existing. dest_addr : str, optional (default is '127.0.0.1') Address of mongodb database server. dest_port : int, optional (default is 27017) Port of mongodb database server. username : str, optional (default is None) No authentication at all. passwd : str, optional (default is None) No authentication at all. use_ssl : bool, optional (default is False) Use ssl for the connection. ssl_ca_certs : str, optional (default is None) The CA certificate. Raises ------ DatabaseOpenError ''' # db name not allowed if db_name == APK_DB_NAME: raise DatabaseOpenError(db_name, msg = 'Database name "%s" reserved for apk storage!' % db_name), None, sys.exc_info()[2] # set default values if db_name is None: db_name = 'res' if dest_addr is None: dest_addr = '127.0.0.1' if dest_port is None: dest_port = 27017 try: self.__db_name = db_name self.__dest_addr = dest_addr self.__dest_port = dest_port self.__use_ssl = use_ssl # only pass ssl parameters if ssl enabled ssl_params = dict(ssl = use_ssl, ssl_cert_reqs = ssl.CERT_NONE) if use_ssl else {} # set None cause if connection cannot be initiated, conn var will not in scope self.conn = None self.__conn = conn = pymongo.MongoClient(host = dest_addr, port = dest_port, **ssl_params) # authentication is per database! # do auth before probable db creation etc. if None not in (username, passwd): # authenticate if credentials given log.debug("authenticating with mongodb ...") conn["admin"].authenticate(username, passwd) else: log.debug("not authenticating with mongodb ... no credentials supplied!") self.__db = conn[self.db_name] # apk db self.__apk_db = conn[APK_DB_NAME] self.__apk_coll = gridfs.GridFS(self.__apk_db, GRIDFS_COLLS_PREFIX) # create/open collections self.__res_coll = self._open_res_coll() self.__files_coll = self.__db[GRIDFS_COLLS_PREFIX][GRIDFS_FILES] # grid fs for binary files, supports files > 16 mb self.__grid_fs = self._open_gridfs() # create indexes self._create_idx_for_colls() log.info("Opened database: %s", self) log.debug("CA certificate: %s", ssl_ca_certs) except PyMongoError as e: raise DatabaseOpenError(str(self), caused_by = e), None, sys.exc_info()[2] def __del__(self): ''' Close db connection ''' if self.conn is not None: log.debug("Closing db connection ... ") self.conn.close()
[docs] def get_apk_db(self): return self.__apk_db
[docs] def get_apk_coll(self): return self.__apk_coll
[docs] def set_apk_db(self, value): self.__apk_db = value
[docs] def set_apk_coll(self, value): self.__apk_coll = value
[docs] def del_apk_db(self): del self.__apk_db
[docs] def del_apk_coll(self): del self.__apk_coll
def __str__(self): return '%s: mongodb://%s:%s/%s/?ssl=%s' % (self.__class__.__name__, self.dest_addr, self.dest_port, self.db_name, self.use_ssl)
[docs] def get_db_name(self): return self.__db_name
[docs] def set_db_name(self, value): self.__db_name = value
[docs] def del_db_name(self): del self.__db_name
[docs] def get_files_coll(self): return self.__files_coll
[docs] def set_files_coll(self, value): self.__files_coll = value
[docs] def del_files_coll(self): del self.__files_coll
[docs] def get_grid_fs(self): return self.__grid_fs
[docs] def set_grid_fs(self, value): self.__grid_fs = value
[docs] def del_grid_fs(self): del self.__grid_fs
[docs] def get_dest_addr(self): return self.__dest_addr
[docs] def get_dest_port(self): return self.__dest_port
[docs] def set_dest_addr(self, value): self.__dest_addr = value
[docs] def set_dest_port(self, value): self.__dest_port = value
[docs] def del_dest_addr(self): del self.__dest_addr
[docs] def del_dest_port(self): del self.__dest_port
[docs] def get_res_coll(self): return self.__res_coll
[docs] def set_res_coll(self, value): self.__res_coll = value
[docs] def del_res_coll(self): del self.__res_coll
[docs] def get_conn(self): return self.__conn
[docs] def get_db(self): return self.__db
[docs] def set_conn(self, value): self.__conn = value
[docs] def set_db(self, value): self.__db = value
[docs] def del_conn(self): del self.__conn
[docs] def del_db(self): del self.__db
[docs] def get_use_ssl(self): return self.__use_ssl
db_name = property(get_db_name, set_db_name, del_db_name, "db_name : str, optional (default is 'res') - The name of the database to use. Will be created if not already existing.") dest_addr = property(get_dest_addr, set_dest_addr, del_dest_addr, "str, optional (default is '127.0.0.1') : Address of mongodb database server.") dest_port = property(get_dest_port, set_dest_port, del_dest_port, "int, optional (default is 27017) : Port of mongodb database server.") use_ssl = property(get_use_ssl, None, None, " bool, optional (default is False) : Use ssl for the connection.") conn = property(get_conn, set_conn, del_conn, "pymongo.mongo_client.MongoClient : Mongodb connection") db = property(get_db, set_db, del_db, "pymongo.database.Database : Database") res_coll = property(get_res_coll, set_res_coll, del_res_coll, "pymongo.collection.Collection : results collection for documents") grid_fs = property(get_grid_fs, set_grid_fs, del_grid_fs, "gridfs.GridFS : Gridfs object for non-document and binary storage.") files_coll = property(get_files_coll, set_files_coll, del_files_coll, "pymongo.collection.Collection : files follection of gridfs") apk_db = property(get_apk_db, set_apk_db, del_apk_db, "pymongo.database.Database : Apk database") apk_coll = property(get_apk_coll, set_apk_coll, del_apk_coll, "gridfs.GridFS : Apk collection (gridfs)") ############################################################ #---ResultStorageInterface ############################################################
[docs] def store_result_for_apk(self, apk, script): ''' See doc of :py:meth:`.ResultWritingInterface.store_result_for_apk`. Returns ------- tuple<str, bool> First component is the id of the entry and the second a boolean indication if the result has been stored in gridfs. None If an error occurred. ''' try: # escape keys for mongodb insert res_obj_dict = escape_keys(script.result_dict(gen_id = False)) _id = script.gen_unique_id() # if data is to big or custom result object used -> store with gridfs if script.uses_custom_result_object() or script.is_big_res(): log.debug("storing results for %s, %s in %s (id: %s)", apk.short_description(), script, self.grid_fs, _id) result = self.get_custom_res_obj_representation(script) gridfs = self.grid_fs # gridfs doesn't have an update method -> delete and insert if gridfs.exists(**{RESOBJ_ID : _id}): # delete by _id gridfs.delete(_id) # store file together with metadata from `ResultObject` gridfs.put(result, metadata = res_obj_dict, filename = script.get_file_name(), _id = _id) # return id return _id, True # normal json data else: log.debug("storing results for %s, %s in %s db(id: %s)", apk.short_description(), script, self.res_coll, _id) # set id so we don't have multiple results for same script and apk res_obj_dict[RESOBJ_ID] = _id # update or insert if not existing self.res_coll.update({RESOBJ_ID : _id}, res_obj_dict, upsert = True) # return id return _id, False except (PyMongoError, BSONError) as e: raise DatabaseStoreException(self, "script: %s" % script, caused_by = e), None, sys.exc_info()[2]
[docs] def get_results(self, include_fields = None, exclude_fields = None, where = None, distinct_key = None, n = None, sort = True, latest = False, non_document = False, non_document_raw = False, remove_id_field = True, **kwargs): ''' See doc of :py:meth:`.ResultStorageInterface.get_results` ''' if include_fields is not None and exclude_fields is not None: raise ValueError("include_fields and exclude_fields are mutually exclusive!") if include_fields is None: include_fields = [] if exclude_fields is None: exclude_fields = [] if where is None: where = {} # latest means enable sorting and only return one result if latest: sort = True n = 1 # create projection dict fields = [(p, 0) for p in exclude_fields] + [(p, 1) for p in include_fields] if remove_id_field: # we don't want the id field fields += [(RESOBJ_ID, 0)] select = dict(fields) # no projection criteria given, disable! # because empty dict means only id if not select: select = None where.update(self.create_where_clause(kwargs, from_gridfs = non_document)) try: res_cursor = None # get appropriate collection coll = self.__get_collection(gridfs_files_coll = non_document and not non_document_raw, gridfs_obj = non_document and non_document_raw) # pymongo 3.0 removed the as_class option in the collection.find method # this is the fix find_kwargs = {} if int(pymongo.version[0]) < 3: find_kwargs['as_class'] = OrderedDict # grid fs if non_document: if non_document_raw: log.debug("mongodb query: find(%s) on gridfs", where) res_cursor = coll.find(where) else: # using the gridfs files collection directly enables us projection an attributes log.debug("mongodb query: find(%s, %s) ", where, select) res_cursor = coll.find(where, select, **find_kwargs) # normal collection else: res_cursor = coll.find(where, select, **find_kwargs) log.debug("mongodb query: find(%s, %s) ", where, select) # enable sorting if wanted if sort: # construct sorting criteria structure, structure is different if using gridfs sort_crit = [( MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs=non_document) , -1)] res_cursor = res_cursor.sort(sort_crit) # limit results if wanted if n is not None: res_cursor = res_cursor.limit(n) # generator that abstracts if normal collection or is gridfs if non_document: if non_document_raw: return res_cursor if distinct_key is not None: res_cursor = res_cursor.distinct(distinct_key) return res_cursor except PyMongoError as e: raise DatabaseLoadException(self, "find(%s, %s)", where, select, caused_by = e), None, sys.exc_info()[2]
[docs] def get_results_for_ids(self, ids, non_document = False, non_document_raw = False): ''' See :py:method:`.ResultStorageInterface.get_results_for_ids` ''' return self.get_results(where = {RESOBJ_ID : {MONGODB_IN_OPERATOR : ids}}, non_document = non_document, non_document_raw = non_document_raw)
[docs] def delete_results(self, where = None, non_document = False, **kwargs): ''' See doc of :py:meth:`.ResultStorageInterface.delete_results` ''' coll = self.__get_collection(gridfs_obj = non_document) if where is None: where = {} where.update(self.create_where_clause(kwargs, from_gridfs = non_document)) n = 0 try: # do the query log.debug("mongodb remove(%s)", where) # gridfs if non_document: # get ids and delete for _id in self.get_ids(where = where, non_document = non_document): coll.delete(_id) log.debug("Deleted element with id: %s from mongodb gridfs!", _id) n += 1 # normal collection else: write_result = coll.remove(where, getLastError=True) if write_result is not None: n = write_result["n"] return n except PyMongoError as e: log.exception(DatabaseDeleteException(self, where, e)) return n
[docs] def erase_whole_db(self): ''' Use to drop collections and recreate them. See doc of :py:meth:`.ResultStorageInterface.erase_whole_db` ''' self.__recreate_collections(gridfs = True, res_collection = True)
############################################################ #---ApkCopyInterface ############################################################
[docs] def copy_apk(self, apk, file_like_obj, **kwargs): ''' See doc of :py:meth:`.ApkCopyInterface.copy_apk`. Inserts the apk from the `file_like_obj` into mongodb's gridfs, but only if not already in db. Returns ------- The id of the apk (in db) ''' file_like_obj.seek(0) try: gridfs = self.__apk_coll # escape keys accoring to mongodb rules apk_meta = escape_keys(apk.meta_dict()) _id = apk.hash # gridfs doesn't have an update method -> delete and insert if not gridfs.exists(**{RESOBJ_ID : _id}): # store file together with metadata filename = os.path.basename(apk_meta[RESOBJ_APK_META][RESOBJ_APK_META_PATH]) gridfs.put(file_like_obj, metadata = apk_meta[RESOBJ_APK_META], filename = filename, _id = _id, chunkSize = MAX_BSON_SIZE) log.info("put %s into %s", apk.short_description(), self) except (PyMongoError, BSONError) as e: raise DatabaseStoreException(self, "apk: %s" % apk.short_description(), caused_by = e), None, sys.exc_info()[2] # return id return _id
[docs] def get_apk(self, _hash, **kwargs): ''' Get the `EAndroApk` from `_hash`. Parameters ---------- _hash : str Hash of the .apk (sha256) Raises ------ DatabaseLoadException NoFile If the file is not present. Returns ------- EAndroApk Apk constructed from raw data and meta infos. ''' try: gridfs = self.__apk_coll log.info("getting apk: %s from mongodb ...", _hash) gridfs_obj = gridfs.get(_hash) # get raw .apk apk_zipfile = gridfs_obj.read() # get apk meta infos apk_meta = gridfs_obj.metadata package_name, version_name, path, _hash, import_date, tag = apk_meta[RESOBJ_APK_META_PACKAGE_NAME], apk_meta[RESOBJ_APK_META_VERSION_NAME], apk_meta[RESOBJ_APK_META_PATH], apk_meta[RESOBJ_APK_META_HASH], apk_meta[RESOBJ_APK_META_IMPORT_DATE], apk_meta[RESOBJ_APK_META_TAG] # use to hold apk meta infos fast_apk = FastApk(package_name, version_name, path, _hash, import_date, tag) eandro_apk = AnalyzeUtil.open_apk(apk_zipfile, fast_apk, raw = True) log.info("got apk") return eandro_apk except NoFile: raise except PyMongoError as e: raise DatabaseLoadException(self, content = "Apk (hash=%s)" % _hash, caused_by = e), None, sys.exc_info()[2]
############################################################ #---MongoDB query builder helper functions ############################################################
[docs] def create_where_clause(self, kwargs, from_gridfs = False): ''' Create where clause from `kwargs`. Parameters ---------- from_gridfs : bool, optional (default is False) Whether to build where clause for gridfs. Other Parameters ---------------- package_name : str, optional (default is None) apk_hash : str, optional (default is None) version_name : str, optional (default is None) tag : str, optional (default is None) script_hash : str, optional (default is None) script_name : str, optional (default is None) script_version : str, optional (default is None) Notes ----- If any of the other parameters is None it won't be used for filtering. Returns ------- ''' # create filter dict wheres = [] wheres += MongoUtil.build_apk_meta_where(kwargs, gridfs = from_gridfs) wheres += MongoUtil.build_script_meta_where(kwargs, gridfs = from_gridfs) return dict(wheres)
############################################################ #---Other ############################################################
[docs] def get_eandro_apk(self, _id): ''' Get the `EAndroApk` from database. Returns ------- EAndroApk None If Apk could not be loaded ''' try: return self.get_apk(_id) except (DatabaseLoadException, NoFile) as e: log.warn(e)
############################################################ #---Helper stuff ############################################################ def _create_idx_for_colls(self): ''' Create index(es) for the collections ''' def create_idx(coll): ''' Create index on a single collection ''' # apk meta coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_APK_META_PACKAGE_NAME, gridfs = False), -1)]) coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_APK_META_BUILD_DATE, gridfs = False), -1)]) # script meta coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_ANALYSIS_DATE, gridfs = False), -1)]) coll.ensure_index([(MongoUtil.get_attr_str(RESOBJ_SCRIPT_META, RESOBJ_SCRIPT_META_NAME, gridfs = False), -1)]) # create indexes create_idx(self.res_coll) create_idx(self.files_coll) def _open_res_coll(self): ''' Create/open results collection. Raises ------ PyMongoError ''' res_coll = self.db[RESULT_DOCUMENTS_COLLECTION_NAME] # pymongo 3.0 removed the as_class option in the collection.find method # this is the fix if int(pymongo.version[0]) >= 3: from bson.codec_options import CodecOptions res_coll = res_coll.with_options(codec_options = CodecOptions(document_class = OrderedDict)) return res_coll def _open_gridfs(self): ''' Create/open gridfs. Raises ------ PyMongoError ''' return gridfs.GridFS(self.db, GRIDFS_COLLS_PREFIX) def __get_collection(self, gridfs_files_coll = False, gridfs_obj = False): ''' Get the right collection. If no parameter supplied or all False, the collection for normal document storage will be returned. Parameters are mutually exclusive! Parameters ---------- gridfs_files_coll : bool, optional (default is False) If you need to access the gridfs fils collection. Returns a normal collection, no `GridFS` object! gridfs_obj : bool, optional (default is False) Get the GridFS object. Returns ------- gridfs.GridFS If `gridfs_obj` pymongo.collection.Collection Otherwise ''' if gridfs_files_coll: return self.get_files_coll() if gridfs_obj: return self.grid_fs return self.get_res_coll()
[docs] def get_ids(self, non_document = False, where = None): ''' Get the id's for the results filtered by `where`. Parameters ---------- non_document : bool, optional (default is False) If True, use gridfs. where : dict, optional (default is None) Dictionary doing the filtering. If not given, get all ids. Returns ------ list<str> ''' id_gen = self.get_results(where = where, remove_id_field = False, non_document = non_document, include_fields = [RESOBJ_ID]) # get id out of dict return [id_dict[RESOBJ_ID] for id_dict in id_gen]
[docs] def get_all_ids(self, where = None): ''' Get all id's filtered by `where`. Parameters ---------- where : dict, optional (default is None) Dictionary doing the filtering. If not given, get all ids. Returns ------ list<str> ''' return self.get_ids(non_document = True, where = where) + self.get_ids(non_document = False, where = where)
def __recreate_collections(self, gridfs = False, res_collection = False): ''' Drop and recreate collections. Parameters ---------- gridfs : bool, optional (default is False) Recreate gridfs. res_collection, bool, optional (default is False) Recreate results collection. ''' try: if gridfs: log.debug("dropping collection %s", GRIDFS_COLLS_PREFIX) log.debug("dropping collection %s", FILES_COLL_NAME) self.db.drop_collection(FILES_COLL_NAME) log.debug("dropping collection %s", CHUNKS_COLL_NAME) self.db.drop_collection(CHUNKS_COLL_NAME) log.debug("recreating collection %s", GRIDFS_COLLS_PREFIX) self._open_gridfs() self._create_idx_for_colls() except PyMongoError as e: log.critical(e) try: if res_collection: log.debug("dropping collection %s", RESULT_DOCUMENTS_COLLECTION_NAME) self.db.drop_collection(RESULT_DOCUMENTS_COLLECTION_NAME) self._open_res_coll() log.debug("recreating collection %s", RESULT_DOCUMENTS_COLLECTION_NAME) except PyMongoError as e: log.critical(e)
[docs]def factory_from_config(settings): ''' Get a factory_from_config object from the distributed config. Parameters ---------- settings : Settings ''' from androlyze.celery.celerysettings import settings return ResultDatabaseStorage(*settings.get_mongodb_settings())
if __name__ == '__main__': from androlyze.model.script.ScriptUtil import dict2json print "foo" res_db = ResultDatabaseStorage() _if = ["script meta", "apk meta", "apkinfo.libraries"] _if = None ef = ["apkinfo"] wheres = [("apk meta.package name", "a2dp.Vol")] version = "0.1" for res in res_db.get_results(include_fields = _if, exclude_fields = ef, wheres = wheres, n = 2, version=version): print dict2json(res) print res_db