diff --git a/onioningestor/__init__.py b/onioningestor/__init__.py index 1d9565b..8437f68 100644 --- a/onioningestor/__init__.py +++ b/onioningestor/__init__.py @@ -61,7 +61,7 @@ class Ingestor: # Instantiate operator plugins. self.logger.debug("initializing operators") - self.operators = {name: operator(self.logger, **kwargs) + self.operators = {name: operator(self.logger, self.blacklist, **kwargs) for name, operator, kwargs in self.config.operators()} except Exception as e: @@ -99,6 +99,8 @@ class Ingestor: ## Process onions with each operator. for operator in self.operators: self.logger.info(f"Processing found onions with operator '{operator}'") + # Set CrawlQueue for every operator + self.operators[operator].set_crawlQueue(self.queue) # Process list of onions self.operators[operator].process(onions) done = True diff --git a/onioningestor/databases/elasticsearch.py b/onioningestor/databases/elasticsearch.py index 152de86..3675065 100644 --- a/onioningestor/databases/elasticsearch.py +++ b/onioningestor/databases/elasticsearch.py @@ -61,7 +61,7 @@ class Plugin(PastieStorage): 'port':self.config['port']}]) self.es.indices.create( index=self.index, - body=self.mapping, + #body=self.mapping, ignore=400) except Exception as e: self.logger.error(e) diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py index 5514f40..d154e67 100644 --- a/onioningestor/operators/__init__.py +++ b/onioningestor/operators/__init__.py @@ -6,8 +6,7 @@ from itertools import islice from datetime import datetime as dt from concurrent.futures import ThreadPoolExecutor -from collections import namedtuple - +from onioningestor.onion import Onion class Operator: """Base class for all Operator plugins. @@ -47,12 +46,13 @@ class Operator: classes. Remember to do so *before* setting any default artifact_types. 
""" self.logger = logger - self.processQueue = Queue() - self.onions = {} - self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist']) + self.onion = Onion deny = allowed_sources or [] self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE) + def set_crawlQueue(self, queue): + self.queueCrawl = queue + def handle_onion(self, url): """Override with the same signature. @@ -78,6 +78,7 @@ class Operator: blacklist = self.blacklist.findall(content) if blacklist: onion.denylist = blacklist + onion.status = 'blocked' def collect(self, onions): for onion in onions: diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py index 174dd60..fd84b47 100644 --- a/onioningestor/operators/html.py +++ b/onioningestor/operators/html.py @@ -23,8 +23,8 @@ class Plugin(Operator): This plugin collects HTML code from onion link """ - def __init__(self, logger, **kwargs): - super(Plugin, self).__init__(logger) + def __init__(self, logger, denylist, **kwargs): + super(Plugin, self).__init__(logger, denylist) self.name = kwargs['name'] self.logger.info(f"Initializing {self.name}") @@ -90,7 +90,7 @@ class Plugin(Operator): "title": html.title.text, "language": detect(html.text), "status": "success", - "interestingKeywords": Counter(self.interesting.findall(result)), + "interestingKeywords": list(set(self.interesting.findall(result))), } else: index = { @@ -98,7 +98,7 @@ class Plugin(Operator): "title": None, "language": None, "status": "success", - "interestingKeywords": Counter(self.interesting.findall(result)), + "interestingKeywords": list(set(self.interesting.findall(result))), } return self.response("success", index) diff --git a/onioningestor/operators/onionscan.py b/onioningestor/operators/onionscan.py index 7e65029..342c874 100644 --- a/onioningestor/operators/onionscan.py +++ b/onioningestor/operators/onionscan.py @@ -17,7 +17,8 @@ class Plugin(Operator): Handles reading the config file, calling 
sources, maintaining state and sending artifacts to operators. """ - def __init__(self, logger, **kwargs): + def __init__(self, logger, denylist, **kwargs): + super(Plugin, self).__init__(logger, denylist) self.name = kwargs['name'] self.logger = logger self.logger.info(f'Initializing {self.name}') @@ -27,16 +28,20 @@ class Plugin(Operator): self.torControl = "Zue5a29v4xE6FciWpPF93rR2M2T" def parseDoc(self, data): - data['onionscan'].pop('simpleReport', None) - crawls = data['onionscan'].pop('crawls', None) - hiddenService = data['onionscan'].pop('hiddenService', None) - data['onionscan']['crawls'] = [*crawls] - data['hiddenService'] = hiddenService - for onion in crawls.keys(): - self.queueCrawl(( + data.pop('simpleReport', None) + crawls = data.pop('crawls', None) + hiddenService = data.pop('hiddenService', None) + data['crawls'] = [*crawls] + crawl = set() + for onion in re.findall(r'\s?(\w+\.onion)', str(crawls.keys())): + if onion != hiddenService: + crawl.add(onion) + for item in crawl: + print(f'crawling queue added: {item}') + self.queueCrawl.put(( 3, self.onion( - url=onion, + url=item, source='crawled', type='domain', status='offline', @@ -97,6 +102,7 @@ class Plugin(Operator): self.parseDoc(stdout)) self.logger.info("[!!!] Process timed out for %s", onion) + print(stdout) return self.response("failed",stdout) def handle_onion(self, onion):