fixed crawlqueue issue

pull/6/head
danieleperera 4 years ago
parent e2b7c8346d
commit 57cdd72226

@@ -61,7 +61,7 @@ class Ingestor:
             # Instantiate operator plugins.
             self.logger.debug("initializing operators")
-            self.operators = {name: operator(self.logger, **kwargs)
+            self.operators = {name: operator(self.logger, self.blacklist, **kwargs)
                               for name, operator, kwargs in self.config.operators()}
         except Exception as e:
@@ -99,6 +99,8 @@ class Ingestor:
             ## Process onions with each operator.
             for operator in self.operators:
                 self.logger.info(f"Processing found onions with operator '{operator}'")
+                # Set CrawlQueue for every operator
+                self.operators[operator].set_crawlQueue(self.queue)
                 # Process list of onions
                 self.operators[operator].process(onions)
             done = True
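Together these two hunks wire one shared crawl queue into every operator: the ingestor passes its blacklist at construction time and hands the same queue to each operator before processing. A minimal sketch of that wiring, assuming the ingestor's self.queue is a queue.PriorityQueue (the priority-tagged put((3, ...)) later in this commit suggests one); the class name and everything except set_crawlQueue and process are illustrative, not from the repository:

    from queue import PriorityQueue

    class IngestorSketch:
        def __init__(self, operators):
            # One queue shared by all operators, so onions one operator
            # discovers can be fed back into the crawl pipeline.
            self.queue = PriorityQueue()
            self.operators = operators  # {name: operator instance}

        def process(self, onions):
            for name, op in self.operators.items():
                op.set_crawlQueue(self.queue)  # same queue everywhere
                op.process(onions)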

@@ -61,7 +61,7 @@ class Plugin(PastieStorage):
                 'port':self.config['port']}])
             self.es.indices.create(
                 index=self.index,
-                body=self.mapping,
+                #body=self.mapping,
                 ignore=400)
         except Exception as e:
             self.logger.error(e)
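With the explicit mapping commented out, the index is created with Elasticsearch's dynamic mapping, and ignore=400 keeps elasticsearch-py from raising when the index already exists (Elasticsearch answers HTTP 400, resource_already_exists_exception). A short sketch of the pattern, assuming elasticsearch-py 6.x/7.x and illustrative connection settings:

    from elasticsearch import Elasticsearch

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # ignore=400: an "index already exists" error comes back in the response
    # body instead of raising, so repeated startups are idempotent.
    es.indices.create(index='onions', ignore=400)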

@@ -6,8 +6,7 @@ from itertools import islice
 from datetime import datetime as dt
 from concurrent.futures import ThreadPoolExecutor
-from collections import namedtuple
+from onioningestor.onion import Onion

 class Operator:
     """Base class for all Operator plugins.
@@ -47,12 +46,13 @@ class Operator:
         classes. Remember to do so *before* setting any default artifact_types.
         """
         self.logger = logger
-        self.processQueue = Queue()
         self.onions = {}
-        self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist'])
+        self.onion = Onion
         deny = allowed_sources or []
         self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE)

+    def set_crawlQueue(self, queue):
+        self.queueCrawl = queue

     def handle_onion(self, url):
         """Override with the same signature.
@@ -78,6 +78,7 @@ class Operator:
         blacklist = self.blacklist.findall(content)
         if blacklist:
             onion.denylist = blacklist
+            onion.status = 'blocked'

     def collect(self, onions):
         for onion in onions:
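The denylist check above relies on the single compiled regex built in __init__: every deny term is passed through re.escape(), the terms are joined into one alternation, and the result is matched case-insensitively against the page content. A small self-contained demonstration with made-up terms:

    import re

    deny = ['badword', 'c.p.']   # illustrative terms, not from any config
    blacklist = re.compile('|'.join(re.escape(w) for w in deny), re.IGNORECASE)

    hits = blacklist.findall('BADWORD here, c.p. there, but not cXpX')
    print(hits)   # ['BADWORD', 'c.p.'] -- escaping keeps the '.' literal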

@@ -23,8 +23,8 @@ class Plugin(Operator):
     This plugin collects HTML code from onion link
     """
-    def __init__(self, logger, **kwargs):
-        super(Plugin, self).__init__(logger)
+    def __init__(self, logger, denylist, **kwargs):
+        super(Plugin, self).__init__(logger, denylist)
         self.name = kwargs['name']
         self.logger.info(f"Initializing {self.name}")
@@ -90,7 +90,7 @@ class Plugin(Operator):
                 "title": html.title.text,
                 "language": detect(html.text),
                 "status": "success",
-                "interestingKeywords": Counter(self.interesting.findall(result)),
+                "interestingKeywords": list(set(self.interesting.findall(result))),
             }
         else:
             index = {
@@ -98,7 +98,7 @@ class Plugin(Operator):
                 "title": None,
                 "language": None,
                 "status": "success",
-                "interestingKeywords": Counter(self.interesting.findall(result)),
+                "interestingKeywords": list(set(self.interesting.findall(result))),
             }
         return self.response("success", index)
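Both hunks replace the Counter of keyword hits with a deduplicated list. A plausible motivation, given the Elasticsearch storage above: a Counter serializes as a JSON object whose keys differ per document, so the interestingKeywords field's mapping grows with every new keyword, while a list of unique strings indexes as one stable keyword array (at the cost of losing the per-keyword counts). The difference, illustrated:

    from collections import Counter

    matches = ['bitcoin', 'market', 'bitcoin']
    Counter(matches)        # {'bitcoin': 2, 'market': 1} -> object field,
                            #   new mapping entry per distinct keyword
    list(set(matches))      # ['bitcoin', 'market'] (order arbitrary)
                            #   -> a single keyword-array field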

@@ -17,7 +17,8 @@ class Plugin(Operator):
     Handles reading the config file, calling sources, maintaining state and
     sending artifacts to operators.
     """
-    def __init__(self, logger, **kwargs):
+    def __init__(self, logger, denylist, **kwargs):
+        super(Plugin, self).__init__(logger, denylist)
         self.name = kwargs['name']
         self.logger = logger
         self.logger.info(f'Initializing {self.name}')
@@ -27,16 +28,20 @@ class Plugin(Operator):
         self.torControl = "Zue5a29v4xE6FciWpPF93rR2M2T"

     def parseDoc(self, data):
-        data['onionscan'].pop('simpleReport', None)
-        crawls = data['onionscan'].pop('crawls', None)
-        hiddenService = data['onionscan'].pop('hiddenService', None)
-        data['onionscan']['crawls'] = [*crawls]
-        data['hiddenService'] = hiddenService
-        for onion in crawls.keys():
-            self.queueCrawl((
+        data.pop('simpleReport', None)
+        crawls = data.pop('crawls', None)
+        hiddenService = data.pop('hiddenService', None)
+        data['crawls'] = [*crawls]
+        crawl = set()
+        for onion in re.findall(r'\s?(\w+\.onion)', str(crawls.keys())):
+            if onion != hiddenService:
+                crawl.add(onion)
+        for item in crawl:
+            print(f'crawling queue added: {item}')
+            self.queueCrawl.put((
                 3,
                 self.onion(
-                    url=onion,
+                    url=item,
                     source='crawled',
                     type='domain',
                     status='offline',
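How the rewritten parseDoc feeds the crawl queue: onionscan's crawls report is keyed by crawled URL, the regex pulls bare .onion hostnames out of those keys, the set deduplicates them, the scanned service itself is skipped, and each survivor is queued at priority 3. A runnable illustration with made-up data, assuming queueCrawl is the shared queue.PriorityQueue:

    import re
    from queue import PriorityQueue

    crawls = {'http://abcdefgh23456789.onion/page': 200,
              'http://zzzzzzzz23456789.onion/': 200}      # made-up report keys
    hiddenService = 'zzzzzzzz23456789.onion'               # the scanned service

    crawl = set()
    for onion in re.findall(r'\s?(\w+\.onion)', str(crawls.keys())):
        if onion != hiddenService:
            crawl.add(onion)                               # dedupe + self-skip

    queueCrawl = PriorityQueue()
    for item in crawl:
        queueCrawl.put((3, item))                          # 3 = crawl priority
    print(crawl)                                           # {'abcdefgh23456789.onion'}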
@@ -97,6 +102,7 @@ class Plugin(Operator):
                 self.parseDoc(stdout))
             self.logger.info("[!!!] Process timed out for %s", onion)
             print(stdout)
+            return self.response("failed",stdout)

     def handle_onion(self, onion):