fixed indexing issue

Branch: pull/6/head
Author: danieleperera (4 years ago)
Parent: 57cdd72226
Commit: c752c09e25

@@ -5,6 +5,7 @@ import traceback
 import threading
 import collections
 from queue import Queue
+from itertools import islice

 from . import config
 from . import loghandler
@@ -61,7 +62,7 @@ class Ingestor:
             # Instantiate operator plugins.
             self.logger.debug("initializing operators")
-            self.operators = {name: operator(self.logger, self.blacklist, **kwargs)
+            self.operators = {name: operator(self.logger, self.config.torController(), self.blacklist, **kwargs)
                               for name, operator, kwargs in self.config.operators()}
         except Exception as e:
@@ -70,6 +71,21 @@ class Ingestor:
             self.logger.debug(traceback.print_exc())
             sys.exit(1)

+    def iter_batches(self, data, batch_size):
+        data = iter(data)
+        while True:
+            batch = list(islice(data, batch_size))
+            if len(batch) == 0:
+                break
+            yield batch
+
+    def process(self, onions):
+        for operator in self.operators:
+            self.logger.info(f"Processing found onions with operator '{operator}'")
+            # Set CrawlQueue for every operator
+            self.operators[operator].set_crawlQueue(self.queue)
+            # Process list of onions
+            self.operators[operator].process(onions)
+
     def run(self):
         """Run once, or forever, depending on config."""
@@ -86,7 +102,6 @@ class Ingestor:
         """Run each source once, passing artifacts to each operator."""
         # Start collecting sources
         self.collect_sources()
-        # Sources will fill various queues
         # MonitorQueue has priority high
         # OnionQueue are those found in clearnet medium
@@ -97,16 +112,12 @@ class Ingestor:
         while not done:
             try:
                 ## Process onions with each operator.
-                for operator in self.operators:
-                    self.logger.info(f"Processing found onions with operator '{operator}'")
-                    # Set CrawlQueue for every operator
-                    self.operators[operator].set_crawlQueue(self.queue)
-                    # Process list of onions
-                    self.operators[operator].process(onions)
-                done = True
-                ## Save Onions for each storage
-                for onion in onions:
-                    self.storage.save_pastie(onion[1], 30)
+                for batched_onions in self.iter_batches(onions, batch_size=10):
+                    self.process(batched_onions)
+                    ## Save Onions for each storage
+                    for onion in batched_onions:
+                        self.storage.save_pastie(onion[1], 30)
+                done = True
             except Exception as e:
                 self.logger.error(e)
                 self.logger.error(traceback.print_exc())

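Note on the Ingestor changes above: run_once() now slices the collected onions into fixed-size batches, runs every operator over each batch, and only then persists that batch, so storage no longer re-iterates a sequence the operators may already have consumed, and the onion[1] tuple indexing (queue entries are (priority, onion) pairs) is applied consistently. A minimal standalone sketch of the same batching pattern, with illustrative data:

from itertools import islice

def iter_batches(data, batch_size):
    # Lazily consume any iterable, yielding lists of at most batch_size items.
    data = iter(data)
    while True:
        batch = list(islice(data, batch_size))
        if len(batch) == 0:
            break
        yield batch

# Queue entries in this project are (priority, onion) tuples, hence onion[1].
items = [(3, f"example{i}.onion") for i in range(25)]
print([len(b) for b in iter_batches(items, batch_size=10)])  # [10, 10, 5]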
@@ -56,6 +56,9 @@ class Config:
     def blacklist(self):
         return self.config["general"]["blacklist"].split(",")

+    def torController(self):
+        return self.config["general"]["TorController"]
+
     def monitorQueue(self):
         fp = self.config["monitor"].get("filename", False)
         q = PriorityQueue(maxsize=0)

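The new torController() accessor feeds the Operator base class (further down), which reads torControl['port'] and torControl['password'] from it. In dict terms, the config the code expects looks roughly like the following; the keys are inferred from the code in this commit and the values are placeholders, not taken from the repo's sample config:

config = {
    "general": {
        "blacklist": "example1,example2",        # consumed by blacklist().split(",")
        "TorController": {                       # returned by torController()
            "port": 9051,                        # Controller.from_port(port=int(...))
            "password": "tor-control-password",  # controller.authenticate(...)
        },
    },
}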
@@ -116,24 +116,8 @@ class PastieStorage():
             self.logger.debug('{0}: pastie[{1}] saved in {2}s'.format(self.name, pastie.url, delta))
         except Exception as e:
             self.logger.error('{0}: unable to save pastie[{1}]: {2}'.format(self.name, pastie.url, e))
-            raise
+            pass
+            #raise

-    def __seen_pastie__(self, pastie_id, **kwargs):
-        raise NotImplementedError
-
-    def seen_pastie(self, pastie_id, **kwargs):
-        if not self.lookup:
-            return False
-        try:
-            start = time.time()
-            self.logger.debug('{0}: looking up pastie[{1}]'.format(self.name, pastie_id))
-            res = self.__seen_pastie__(pastie_id, **kwargs)
-            delta = time.time() - start
-            self.logger.debug('{0}: pastie[{1}] looked-up in {2}s'.format(self.name, pastie_id, delta))
-            return res
-        except Exception as e:
-            self.logger.error('{0}: unable to lookup pastie[{1}]: {2}'.format(self.name, pastie_id, e))
-            raise

 class Notifier(object):

     def __init__(self, logger, **kwargs):

@@ -61,7 +61,6 @@ class Plugin(PastieStorage):
                 'port':self.config['port']}])
             self.es.indices.create(
                 index=self.index,
-                #body=self.mapping,
                 ignore=400)
         except Exception as e:
             self.logger.error(e)

@@ -14,10 +14,11 @@ class Plugin(PastieStorage):
     def __save_pastie__(self, pastie):
         message = '''
-HiddenSite: {site}
+HiddenSite
+{site}
 Source : {url}
 Monitor : {content}
 Status : {status}
 '''.format(
             site=pastie.url,
             url=pastie.source,

@@ -10,13 +10,10 @@ class Onion(object):
         self.monitor = monitor
         self.denylist = denylist
         self.datetime = dt.now()
+        self.operators = {}

-    def simpleHTML(self, response):
-        self.simpleHTML = response
-
-    def onionscan(self, response):
-        self.onionscan = response
+    def set_operator(self, response):
+        self.operators.update(response)
+        # if any match update denylist

     def asdict(self):
         d = {
@@ -27,8 +24,7 @@ class Onion(object):
             'monitor': self.monitor,
             'denylist': self.denylist,
             'dateFound': self.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")+"Z",
-            'simpleHTML': self.simpleHTML,
-            'onionscan':self.onionscan
+            'operators': self.operators,
             }
         return d

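Onion now aggregates per-operator results in a single operators dict instead of the ad-hoc simpleHTML/onionscan attributes (which previously shadowed their own method names on first assignment). Since Operator.response() is now namespaced by operator name (see the operators diff below), set_operator() can simply dict.update() each result in. An illustrative sketch of the resulting shape; the operator names here are hypothetical, as real names come from each plugin's kwargs['name']:

operators = {}
# Each plugin returns {name: {"status": ..., "content": ...}} from response().
operators.update({"SimpleHTML": {"status": "success", "content": {"HTML": "<html/>"}}})
operators.update({"OnionScan": {"status": "failed", "content": None}})
# asdict()["operators"] then serializes both results side by side.
print(operators["OnionScan"]["status"])  # failed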
@@ -1,11 +1,15 @@
 import re
 import sys
 import json
+import time
+import requests
 from queue import Queue
-from itertools import islice
 from datetime import datetime as dt
 from concurrent.futures import ThreadPoolExecutor
+from stem.control import Controller
+from stem import Signal

 from onioningestor.onion import Onion

 class Operator:
@@ -22,7 +26,7 @@ class Operator:
     override other existing methods from this class.
     """

-    def __init__(self, logger, allowed_sources=None):
+    def __init__(self, logger, config, allowed_sources=None):
         """Override this constructor in child classes.

         The arguments above (artifact_types, filter_string, allowed_sources)
@@ -47,9 +51,23 @@ class Operator:
         """
         self.logger = logger
         self.onion = Onion
+        self.torControl = config
         deny = allowed_sources or []
         self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE)

+    # signal TOR for a new connection
+    def renew_connection(self):
+        with Controller.from_port(port = int(self.torControl['port'])) as controller:
+            # Now we switch TOR identities to make sure we have a good connection
+            self.logger.info('Getting new Tor IP')
+            # authenticate to our local TOR controller
+            controller.authenticate(self.torControl['password'])
+            # send the signal for a new identity
+            controller.signal(Signal.NEWNYM)
+            # wait for the new identity to be initialized
+            time.sleep(controller.get_newnym_wait())
+            self.logger.info(f"IP is {requests.get('http://httpbin.org/ip').json()['origin']}")
+
     def set_crawlQueue(self, queue):
         self.queueCrawl = queue
@@ -61,7 +79,7 @@ class Operator:
         """
         raise NotImplementedError()

-    def response(self, status, content):
+    def response(self, operator, status, content):
         """
         status: success/failure
         content: dict
@@ -69,7 +87,7 @@ class Operator:
         return: dict
         """
         try:
-            return {'status':status, 'content': content}
+            return {operator:{'status':status, 'content': content}}
         except Exception as e:
             self.logger.error(e)
@@ -80,29 +98,33 @@ class Operator:
             onion.denylist = blacklist
             onion.status = 'blocked'

+    def findCrawls(self, content, hiddenService):
+        crawl = set()
+        for onion in re.findall(r'\s?(\w+.onion)', str(content)):
+            if onion != hiddenService:
+                crawl.add(onion)
+        for item in crawl:
+            print(f'crawling queue added: {item}')
+            self.queueCrawl.put((
+                3,
+                self.onion(
+                    url=item,
+                    source='crawled',
+                    type='domain',
+                    status='offline',
+                    monitor=False,
+                    denylist=False)))
+
     def collect(self, onions):
         for onion in onions:
-            self.logger.info(f'thread function processing {onion}')
-            if onion.monitor:
-                self.handle_onion(onion)
-            else:
-                if self._onion_is_allowed(onion.url):
-                    self.handle_onion(onion)
+            self.logger.info(f'thread function processing {onion[1]}')
+            self.handle_onion(onion[1])

-    def iter_batches(self, data, batch_size):
-        data = iter(data)
-        while True:
-            batch = list(islice(data, batch_size))
-            if len(batch) == 0:
-                break
-            yield batch

     def process(self, onions):
         """Process all applicable onions."""
         for onion in onions:
             self.handle_onion(onion[1])
-        #self.save_pastie()
         #with ThreadPoolExecutor(max_workers=1) as executor:
         #    collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
         #    for tasks in collect_tasks:

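One caveat on the new findCrawls(): in r'\s?(\w+.onion)' the dot is unescaped (it matches any character) and \w excludes dots and hyphens, so the pattern both over- and under-matches. A quick check with made-up addresses:

import re

pattern = re.compile(r'\s?(\w+.onion)')  # the commit's pattern, verbatim
text = "seen: abcdef.onion http://uvwxyzonion foo-bar.onion"
print(pattern.findall(text))
# ['abcdef.onion', 'uvwxyzonion', 'bar.onion']
# 'uvwxyzonion' has no dot yet matches; 'foo-bar.onion' is truncated at the hyphen.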
@@ -23,8 +23,8 @@ class Plugin(Operator):
     This plugin collects HTML code from onion link
     """
-    def __init__(self, logger, denylist, **kwargs):
-        super(Plugin, self).__init__(logger, denylist)
+    def __init__(self, logger, denylist, config, **kwargs):
+        super(Plugin, self).__init__(logger, denylist, config)
         self.name = kwargs['name']
         self.logger.info(f"Initializing {self.name}")
@@ -37,7 +37,6 @@ class Plugin(Operator):
         )
         self.proxy = kwargs["socks5"]
-        self.torControl = kwargs["TorController"]
         self.headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -57,33 +56,21 @@ class Plugin(Operator):
             self.logger.debug(traceback.print_exc())
             return s

-    def renew_connection(self):
-        with Controller.from_port(port=self.torControl["port"]) as controller:
-            # Now we switch TOR identities to make sure we have a good connection
-            self.logger.info("Getting new Tor IP")
-            # authenticate to our local TOR controller
-            controller.authenticate(self.torControl["password"])
-            # send the signal for a new identity
-            controller.signal(Signal.NEWNYM)
-            # wait for the new identity to be initialized
-            time.sleep(controller.get_newnym_wait())
-            session = self.get_tor_session()
-            self.logger.info(
-                f"IP is {session.get('http://httpbin.org/ip').json()['origin']}"
-            )

     def run_sessions(self, onion):
         retry = 0
         result = None
         while True:
             try:
-                url = "http://" + onion
+                url = "http://" + onion.url
                 self.logger.info(url)
                 content = self.get_tor_session().get(url)
                 if content.status_code == 200:
                     result = content.text
                 if result:
                     html = BeautifulSoup(result, features="lxml")
+                    ## Find other onion links
+                    self.findCrawls(html, onion.url)
                     if html:
                         index = {
                             "HTML": result,
@@ -100,7 +87,8 @@ class Plugin(Operator):
                             "status": "success",
                             "interestingKeywords": list(set(self.interesting.findall(result))),
                         }
-                        return self.response("success", index)
+                        onion.status = 'online'
+                        return self.response(self.name,"success", index)
             except requests.exceptions.ConnectionError as connection_error:
                 self.logger.error(f"Failed connecting to http://{url}")
@@ -114,10 +102,11 @@ class Plugin(Operator):
                 self.renew_connection()
             if retry > self.retries:
                 self.logger.error("[x] Max retries exceeded")
-                return self.response("failed", None)
+                return self.response(self.name,"failed", None)

     def handle_onion(self, onion):
-        html = self.run_sessions(onion.url)
-        if html['status'] == 'success':
-            self._onion_is_allowed(html['content']['HTML'], onion)
-            onion.simpleHTML(html)
+        html = self.run_sessions(onion)
+        response = html[self.name]
+        if response['status'] == 'success':
+            self._onion_is_allowed(response['content']['HTML'], onion)
+            onion.set_operator(html)

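This plugin keeps its get_tor_session() (context above, unchanged here) but now inherits renew_connection() from the Operator base class. For orientation, a self-contained sketch of that pairing, assuming a local Tor client on the default ports (9050 SOCKS, 9051 control); names and values are illustrative, not taken from the repo:

import time
import requests
from stem import Signal
from stem.control import Controller

def get_tor_session(socks5="socks5h://127.0.0.1:9050"):
    # socks5h:// resolves hostnames through Tor, which .onion lookups require.
    session = requests.session()
    session.proxies = {"http": socks5, "https": socks5}
    return session

def renew_connection(port=9051, password="tor-control-password"):
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password)
        controller.signal(Signal.NEWNYM)          # ask Tor for a fresh circuit
        time.sleep(controller.get_newnym_wait())  # honor Tor's rate limiting

print(get_tor_session().get("http://httpbin.org/ip").json()["origin"])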
@@ -1,5 +1,4 @@
 import json
-import time
 import traceback
 import subprocess
 from threading import Timer
@@ -8,6 +7,10 @@ from concurrent.futures import ProcessPoolExecutor
 import requests
+from stem.control import Controller
+from stem import Signal

 from onioningestor.operators import Operator
@@ -17,50 +20,26 @@ class Plugin(Operator):
     Handles reading the config file, calling sources, maintaining state and
     sending artifacts to operators.
     """
-    def __init__(self, logger, denylist, **kwargs):
-        super(Plugin, self).__init__(logger, denylist)
+    def __init__(self, logger, denylist, config, **kwargs):
+        super(Plugin, self).__init__(logger, denylist, config)
         self.name = kwargs['name']
         self.logger = logger
         self.logger.info(f'Initializing {self.name}')
         self.onionscan = kwargs['binpath']
         self.timeout = int(kwargs.get('timeout', 300))
-        self.torControl = 9051
-        self.torControl = "Zue5a29v4xE6FciWpPF93rR2M2T"

     def parseDoc(self, data):
         data.pop('simpleReport', None)
         crawls = data.pop('crawls', None)
         hiddenService = data.pop('hiddenService', None)
         data['crawls'] = [*crawls]
-        crawl = set()
-        for onion in re.findall(r'\s?(\w+.onion)', str(crawls.keys())):
-            if onion != hiddenService:
-                crawl.add(onion)
-        for items in crawl:
-            print(f'crawling queue added: {item}')
-            self.queueCrawl.put((
-                3,
-                self.onion(
-                    url=item,
-                    source='crawled',
-                    type='domain',
-                    status='offline',
-                    monitor=False,
-                    denylist=False)))
+        try:
+            if data['linkedOnions']:
+                self.findCrawls(data['linkedOnions'], hiddenService)
+        except KeyError as e:
+            pass
         return data

-    # signal TOR for a new connection
-    def renew_connection(self):
-        with Controller.from_port(port = self.torControl['port']) as controller:
-            # Now we switch TOR identities to make sure we have a good connection
-            self.logger.info('Getting new Tor IP')
-            # authenticate to our local TOR controller
-            controller.authenticate(self.torControl['password'])
-            # send the signal for a new identity
-            controller.signal(Signal.NEWNYM)
-            # wait for the new identity to be initialized
-            time.sleep(controller.get_newnym_wait())
-            self.logger.info(f"IP is {requests.get('http://httpbin.org/ip').json()['origin']}")

     def handle_timeout(self, process, onion):
         #
@@ -74,7 +53,6 @@ class Plugin(Operator):
         except:
             pass
         self.renew_connection()
-        return

     def run_onionscan(self, onion):
         self.logger.info("[*] Running onionscan on %s", onion)
@@ -94,21 +72,23 @@ class Plugin(Operator):
             process_timer.cancel()
             try:
                 return self.response(
+                    self.name,
                     "success",
                     self.parseDoc(json.loads(stdout)))
             except json.decoder.JSONDecodeError:
                 return self.response(
+                    self.name,
                     "success",
                     self.parseDoc(stdout))

         self.logger.info("[!!!] Process timed out for %s", onion)
-        print(stdout)
-        return self.response("failed",stdout)
+        return self.response(self.name,"failed", None)

     def handle_onion(self, onion):
         try:
-            results = self.run_onionscan(onion.url)
-            onion.onionscan(results)
+            if onion.status != 'inactive':
+                results = self.run_onionscan(onion.url)
+                onion.set_operator(results)
         except Exception as e:
             self.logger.error(e)
             self.logger.error(traceback.print_exc())

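For context on the run_onionscan()/handle_timeout() pair above: the plugin launches the onionscan binary, arms a threading.Timer for self.timeout seconds, and process_timer.cancel() disarms it when the scan exits normally. The same watchdog pattern in isolation; the command and timeout below are placeholders, and the repo's handle_timeout additionally renews the Tor circuit:

import subprocess
from threading import Timer

def run_with_timeout(cmd, timeout):
    # Arm a timer that kills the child process if it overruns.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    process_timer = Timer(timeout, process.kill)
    process_timer.start()
    try:
        stdout, _ = process.communicate()
    finally:
        process_timer.cancel()  # normal exit: disarm the watchdog
    return stdout

# e.g. run_with_timeout(["onionscan", "example.onion"], timeout=300)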