fixed crawlqueue issue

pull/6/head
danieleperera 4 years ago
parent e2b7c8346d
commit 57cdd72226

@@ -61,7 +61,7 @@ class Ingestor:
             # Instantiate operator plugins.
             self.logger.debug("initializing operators")
-            self.operators = {name: operator(self.logger, **kwargs)
+            self.operators = {name: operator(self.logger, self.blacklist, **kwargs)
                               for name, operator, kwargs in self.config.operators()}
         except Exception as e:
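With this change every operator plugin is constructed with the ingestor-wide blacklist as well as the logger. A minimal sketch of what the dict comprehension expands to, assuming config.operators() yields (name, operator_class, kwargs) tuples as the comprehension implies:

    # Illustrative expansion of the comprehension above (names are placeholders).
    operators = {}
    for name, operator_class, kwargs in config.operators():
        # Each plugin now receives the shared blacklist as its second argument.
        operators[name] = operator_class(logger, blacklist, **kwargs)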
@@ -99,6 +99,8 @@ class Ingestor:
                 ## Process onions with each operator.
                 for operator in self.operators:
                     self.logger.info(f"Processing found onions with operator '{operator}'")
+                    # Set CrawlQueue for every operator
+                    self.operators[operator].set_crawlQueue(self.queue)
                     # Process list of onions
                     self.operators[operator].process(onions)
                 done = True
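This is the crawl-queue fix itself: rather than each operator owning a private processQueue, the ingestor now hands its own queue to every operator before processing, so anything an operator discovers flows back into the main crawl loop. A minimal round-trip sketch, assuming self.queue is a PriorityQueue (the queue type is an assumption; the diff only shows .put() with a (priority, item) tuple):

    from queue import PriorityQueue

    class Operator:
        def set_crawlQueue(self, queue):
            # Keep a reference to the ingestor's queue instead of a local one.
            self.queueCrawl = queue

    shared = PriorityQueue()
    op = Operator()
    op.set_crawlQueue(shared)
    op.queueCrawl.put((3, 'example.onion'))      # illustrative payload
    assert shared.get() == (3, 'example.onion')  # ingestor sees the discovery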

@@ -61,7 +61,7 @@ class Plugin(PastieStorage):
                 'port':self.config['port']}])
             self.es.indices.create(
                 index=self.index,
-                body=self.mapping,
+                #body=self.mapping,
                 ignore=400)
         except Exception as e:
             self.logger.error(e)
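Commenting out body=self.mapping means the index is created without an explicit mapping, so Elasticsearch falls back to dynamic field mapping on first write; ignore=400 keeps a repeated create from raising when the index already exists. A hedged sketch with the pre-8.x elasticsearch-py client (host and index name are placeholders):

    from elasticsearch import Elasticsearch

    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    # No body: field types are inferred dynamically from the first document.
    # ignore=400 swallows the resource_already_exists_exception that a
    # second create would otherwise raise as an HTTP 400.
    es.indices.create(index='onions', ignore=400)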

@@ -6,8 +6,7 @@ from itertools import islice
 from datetime import datetime as dt
 from concurrent.futures import ThreadPoolExecutor
-from collections import namedtuple
+from onioningestor.onion import Onion

 class Operator:
     """Base class for all Operator plugins.
@@ -47,12 +46,13 @@ class Operator:
         classes. Remember to do so *before* setting any default artifact_types.
         """
         self.logger = logger
-        self.processQueue = Queue()
-        self.onions = {}
-        self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist'])
+        self.onion = Onion
         deny = allowed_sources or []
         self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE)

+    def set_crawlQueue(self, queue):
+        self.queueCrawl = queue
+
     def handle_onion(self, url):
         """Override with the same signature.
@@ -78,6 +78,7 @@ class Operator:
         blacklist = self.blacklist.findall(content)
         if blacklist:
             onion.denylist = blacklist
+            onion.status = 'blocked'

     def collect(self, onions):
         for onion in onions:

@@ -23,8 +23,8 @@ class Plugin(Operator):
     This plugin collects HTML code from onion link
     """
-    def __init__(self, logger, **kwargs):
-        super(Plugin, self).__init__(logger)
+    def __init__(self, logger, denylist, **kwargs):
+        super(Plugin, self).__init__(logger, denylist)
         self.name = kwargs['name']
         self.logger.info(f"Initializing {self.name}")
@@ -90,7 +90,7 @@ class Plugin(Operator):
                 "title": html.title.text,
                 "language": detect(html.text),
                 "status": "success",
-                "interestingKeywords": Counter(self.interesting.findall(result)),
+                "interestingKeywords": list(set(self.interesting.findall(result))),
             }
         else:
             index = {
@@ -98,7 +98,7 @@ class Plugin(Operator):
                 "title": None,
                 "language": None,
                 "status": "success",
-                "interestingKeywords": Counter(self.interesting.findall(result)),
+                "interestingKeywords": list(set(self.interesting.findall(result))),
             }
         return self.response("success", index)
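Counter keeps per-keyword frequencies and serializes as an object keyed by the matched strings; list(set(...)) flattens that to a deduplicated array, which indexes and queries more cleanly in Elasticsearch at the cost of losing the counts. A quick comparison (the keyword pattern is illustrative):

    import re
    from collections import Counter

    interesting = re.compile('bitcoin|dump|paypal', re.IGNORECASE)
    result = 'bitcoin dump bitcoin paypal'

    print(Counter(interesting.findall(result)))
    # Counter({'bitcoin': 2, 'dump': 1, 'paypal': 1})  -> object with counts

    print(list(set(interesting.findall(result))))
    # e.g. ['paypal', 'bitcoin', 'dump']  -> flat list, order not guaranteed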

@@ -17,7 +17,8 @@ class Plugin(Operator):
     Handles reading the config file, calling sources, maintaining state and
     sending artifacts to operators.
     """
-    def __init__(self, logger, **kwargs):
+    def __init__(self, logger, denylist, **kwargs):
+        super(Plugin, self).__init__(logger, denylist)
         self.name = kwargs['name']
         self.logger = logger
         self.logger.info(f'Initializing {self.name}')
@@ -27,16 +28,20 @@ class Plugin(Operator):
         self.torControl = "Zue5a29v4xE6FciWpPF93rR2M2T"

     def parseDoc(self, data):
-        data['onionscan'].pop('simpleReport', None)
-        crawls = data['onionscan'].pop('crawls', None)
-        hiddenService = data['onionscan'].pop('hiddenService', None)
-        data['onionscan']['crawls'] = [*crawls]
-        data['hiddenService'] = hiddenService
-        for onion in crawls.keys():
-            self.queueCrawl((
+        data.pop('simpleReport', None)
+        crawls = data.pop('crawls', None)
+        hiddenService = data.pop('hiddenService', None)
+        data['crawls'] = [*crawls]
+        crawl = set()
+        for onion in re.findall(r'\s?(\w+\.onion)', str(crawls.keys())):
+            if onion != hiddenService:
+                crawl.add(onion)
+        for item in crawl:
+            print(f'crawling queue added: {item}')
+            self.queueCrawl.put((
                 3,
                 self.onion(
-                    url=onion,
+                    url=item,
                     source='crawled',
                     type='domain',
                     status='offline',
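The reworked parseDoc extracts .onion hostnames from the crawls keys with a regex, dedupes them through a set, skips the service's own address, and puts each discovery on the shared crawl queue at priority 3. A condensed sketch of that flow (sample data and queue type are illustrative):

    import re
    from queue import PriorityQueue

    crawls = {'http://aaaa1234.onion/page': 200, 'http://bbbb5678.onion': 200}
    hiddenService = 'aaaa1234.onion'
    queueCrawl = PriorityQueue()

    crawl = set()
    for onion in re.findall(r'\s?(\w+\.onion)', str(crawls.keys())):
        if onion != hiddenService:
            crawl.add(onion)              # set() dedupes repeat discoveries

    for item in crawl:
        queueCrawl.put((3, item))         # lower numbers dequeue first

    print(queueCrawl.get())               # (3, 'bbbb5678.onion')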
@@ -97,6 +102,7 @@ class Plugin(Operator):
                     self.parseDoc(stdout))
             self.logger.info("[!!!] Process timed out for %s", onion)
+            print(stdout)
             return self.response("failed",stdout)

     def handle_onion(self, onion):
