From a6deba300b03172280ad863fc0a1d5865ae0a560 Mon Sep 17 00:00:00 2001
From: danieleperera
Date: Tue, 7 Jul 2020 18:22:44 +0000
Subject: [PATCH] fixed db workflow

---
 onioningestor/__init__.py           | 12 ++--
 onioningestor/dbhandler.py          | 27 +++++++--
 onioningestor/operators/__init__.py | 21 ++++---
 onioningestor/operators/html.py     | 31 ++++++++--
 onioningestor/sources/simplefile.py |  7 ++-
 onioningestor/sources/torch.py      | 90 +++++++++++++++++++++++++++++
 6 files changed, 156 insertions(+), 32 deletions(-)
 create mode 100644 onioningestor/sources/torch.py

diff --git a/onioningestor/__init__.py b/onioningestor/__init__.py
index a5e655d..fbee85b 100644
--- a/onioningestor/__init__.py
+++ b/onioningestor/__init__.py
@@ -73,23 +73,19 @@ class Ingestor:
             # Run the source to collect artifacts.
             self.logger.info(f"Running source '{source}'")
             try:
+                # Get the generator of onions
                 onions = self.sources[source].run()
-                if onions:
-                    self.logger.info(f'Found hidden links')
-                else:
-                    self.logger.info('No links found')
             except Exception as e:
                 self.logger.error(e)
                 self.logger.error(traceback.print_exc())
                 continue

-            # Process artifacts with each operator.
+            # Process onions with each operator.
             for operator in self.operators:
                 self.logger.info(f"Processing found onions with operator '{operator}'")
                 try:
-                    doc = self.operators[operator].process(onions)
-                    # Save the source state.
-                    self.es.save(doc)
+                    self.operators[operator].process(onions)
+                    # Each operator saves the onion and its collected data itself
                 except Exception as e:
                     self.logger.error(e)
                     self.logger.error(traceback.print_exc())
diff --git a/onioningestor/dbhandler.py b/onioningestor/dbhandler.py
index 3de21e2..d4cb213 100644
--- a/onioningestor/dbhandler.py
+++ b/onioningestor/dbhandler.py
@@ -20,7 +20,8 @@ class DbHandlerElasticSearch:
                 "type": "keyword"
             },
             "monitor": {
-                "type": "boolean"
+                "type": "boolean",
+                "null_value": "false"
             },
             "simple-html": {
                 "type": "nested",
@@ -39,6 +40,9 @@ class DbHandlerElasticSearch:
             },
             "date-indexed": {
                 "type": "date"
+            },
+            "interestingKeywords": {
+                "type": "keyword"
             }
         }
     }
@@ -65,11 +69,22 @@ class DbHandlerElasticSearch:
         self.es.indices.refresh(self.index)
         status = self.es.count(index=self.index)
         if status['_shards']['successful'] == 1:
-            self.logger.info('Successful')
-            self.logger.info('Count:%d',status['count'])
+            self.logger.info('Successfully indexed item on Elasticsearch')
+            self.logger.info('Current item count: %d', status['count'])
         else:
             self.logger.error(status)

-    def save(self, doc):
-        self.es.index(index=self.index,body=doc)
-        self.count()
+    def update(self, _id, data):
+        if _id and data:
+            self.es.update(
+                index=self.index,
+                id=_id,
+                body={"doc": data})
+            self.count()
+
+    def save(self, data):
+        if data:
+            status = self.es.index(index=self.index, body=data)
+            self.count()
+            return status
+
diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py
index 5c3ed5c..061dcdb 100644
--- a/onioningestor/operators/__init__.py
+++ b/onioningestor/operators/__init__.py
@@ -1,7 +1,7 @@
 import re
 import sys
 import json
-
+from datetime import datetime as dt

 class Operator:
     """Base class for all Operator plugins.
@@ -51,16 +51,13 @@ class Operator:
         return: dict
         """
         try:
-            return {operator_name: json.loads(str(content)), 'hiddenService': onion}
-        except json.decoder.JSONDecodeError as e:
-            self.logger.info('JosnDecode Error')
             return {operator_name: content, 'hiddenService': onion}
         #except TypeError:
         #    return {operator_name: None, 'hiddenService': onion}
         except Exception as e:
             self.logger.error(e)

-    def handle_onion(self, url):
+    def handle_onion(self, db, url):
         """Override with the same signature.

         :param artifact: A single ``Artifact`` object.
@@ -69,18 +66,17 @@ class Operator:

         raise NotImplementedError()

-    def _onion_is_allowed(self, response, type='URL'):
+    def _onion_is_allowed(self, response, db, type='URL'):
         """Returns True if this is allowed by this plugin's filters."""
         # Must be in allowed_sources, if set.
         if type == 'URL':
-            print(response)
             blacklist = self.blacklist.findall(response['hiddenService'])
         elif type == 'HTML':
             response['simple-html'].pop('status')
             response['simple-html']['status'] = 'blocked'
             blacklist = self.blacklist.findall(response['simple-html']['HTML'])
         if blacklist:
-            self.es.save(response)
+            self.es.update(db['_id'], response)
             return False
         return True

@@ -88,8 +84,15 @@ class Operator:
     def process(self, onions):
         """Process all applicable onions."""
         for onion in onions:
+            # Add the link to the database first
+            db = self.es.save({
+                'hiddenService': onion.url,
+                'monitor': 'false',
+                'dateAdded': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'})
             if self._onion_is_allowed(
                     self.response({'status':'blocked'},onion.url,'regex-blacklist'),
+                    db,
                     type='URL'):
-                self.handle_onion(onion.url)
+                # Get data for the current link
+                self.handle_onion(db, onion.url)
diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py
index 8b6d415..d0a2064 100644
--- a/onioningestor/operators/html.py
+++ b/onioningestor/operators/html.py
@@ -1,3 +1,4 @@
+import re
 import time
 import json
 import traceback
@@ -29,6 +30,9 @@ class Plugin(Operator):
         self.timeout = int(kwargs['timeout'])
         self.retries = int(kwargs['retries'])

+        interesting = kwargs['interestingKeywords'].split(',')
+        self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE)
+
         self.proxy = kwargs['socks5']
         self.torControl = kwargs['TorController']
         self.headers ={
@@ -74,7 +78,24 @@ class Plugin(Operator):
                 result = content.text
                 if result:
                     html = BeautifulSoup(result, features="lxml")
-                    index = {'HTML':result,'title':html.title.text,'language':detect(html.text),'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z','status':'success'}
+                    if html.title:
+                        index = {
+                            'HTML': result,
+                            'title': html.title.text,
+                            'language': detect(html.text),
+                            'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
+                            'status': 'success',
+                            'interestingKeywords': list(set(self.interesting.findall(result)))
+                        }
+                    else:
+                        index = {
+                            'HTML': result,
+                            'title': None,
+                            'language': None,
+                            'date-crawled': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
+                            'status': 'success',
+                            'interestingKeywords': list(set(self.interesting.findall(result)))
+                        }
                     return self.response(index, onion, self.plugin_name)
             except requests.exceptions.ConnectionError as connection_error:
                 self.logger.error(f'Failed connecting to http://{url}')
@@ -90,10 +111,8 @@ class Plugin(Operator):
             self.logger.error('[x] Max retries exceeded')
             return self.response({'status':"failure"}, onion, self.plugin_name)

-    def handle_onion(self, onion):
+    def handle_onion(self, db, onion):
         content = self.run_sessions(onion)
-        print(content)
         if content[self.plugin_name]['status'] == 'success':
-            if self._onion_is_allowed(content):
-                self.es.save(content)
-
+            if self._onion_is_allowed(content, db, type='HTML'):
+                self.es.update(db['_id'], content)
diff --git a/onioningestor/sources/simplefile.py b/onioningestor/sources/simplefile.py
index b5656d7..f679aeb 100644
--- a/onioningestor/sources/simplefile.py
+++ b/onioningestor/sources/simplefile.py
@@ -1,10 +1,10 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-

-__author__ = 'Andrey Glauzer'
+__author__ = 'Daniele Perera'
 __license__ = "MIT"
-__version__ = "1.0.1"
-__maintainer__ = "Andrey Glauzer"
+__version__ = "1.0.0"
+__maintainer__ = "Daniele Perera"
 __status__ = "Development"

 import requests
@@ -28,4 +28,5 @@ class Plugin(Source):
         lines = fp.read().splitlines()
         for onion in lines:
             yield self.onion(url=onion,source='simple-file',type='domain')
+        os.remove(self.filename)

diff --git a/onioningestor/sources/torch.py b/onioningestor/sources/torch.py
new file mode 100644
index 0000000..abdfef1
--- /dev/null
+++ b/onioningestor/sources/torch.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+__author__ = 'Andrey Glauzer'
+__license__ = "MIT"
+__version__ = "1.0.1"
+__maintainer__ = "Andrey Glauzer"
+__status__ = "Development"
+
+import requests
+import json
+import re
+import logging
+import re
+import urllib.parse
+from random import choice
+import time
+from bs4 import BeautifulSoup
+
+
+class TORCH:
+    def __init__(self,
+                 port_proxy=None,
+                 type_proxy=None,
+                 server_proxy=None,
+                 terms=None,
+                 timeout=None):
+        self.desktop_agents = [
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
+        ]
+        self.url = 'http://xmh57jrzrnw6insl.onion'
+        self.logger = logging.getLogger('Class:TORCH')
+        self.session = requests.session()
+        self.terms = terms
+        self.timeout = timeout
+        self.proxies = {
+            "http": f"{type_proxy}://{server_proxy}:{port_proxy}",
+        }
+        # Select a random agent from the list.
+
+    @property
+    def random_headers(self):
+        return {
+            'User-Agent': choice(self.desktop_agents),
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+        }
+
+    @property
+    def start(self):
+        self.headers = self.random_headers
+        self.logger.info(f'Connecting to {self.url}')
+
+        urls = []
+        self.logger.info('Generating URLs')
+        for term in self.terms:
+            urls.append(
+                f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
+            cont = 0
+            while cont <= 9:
+                cont += 1
+                urls.append(
+                    f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&np={cont}&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
+        onionurls = []
+        for url in urls:
+            self.logger.debug(f'Connecting to {url}')
+            try:
+                request = self.session.get(
+                    url, proxies=self.proxies, timeout=self.timeout)
+
+                if request.status_code == 200:
+                    soup = BeautifulSoup(request.content, features="lxml")
+                    for findurl in soup.find_all('dt'):
+                        onionurls.append(findurl.find('a')['href'].replace('\xad', '')
+                                         .replace('\n', '')
+                                         .replace("http://", '')
+                                         .replace("https://", '')
+                                         .replace(r'\s', '')
+                                         .replace('\t', ''))
+            except (requests.exceptions.ConnectionError,
+                    requests.exceptions.ChunkedEncodingError,
+                    requests.exceptions.ReadTimeout,
+                    requests.exceptions.InvalidURL) as e:
+                self.logger.error(
+                    f'Could not connect to the URL because an error occurred.\n{e}')
+                pass
+        return onionurls
+
+if __name__ == '__main__':
+    app = TORCH()
+    app.start
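
A minimal sketch of the save-then-update flow this patch introduces, written directly against the elasticsearch-py client that DbHandlerElasticSearch wraps. The index name 'onions', the localhost node, and the sample documents are illustrative assumptions, not values taken from the repo config:

    from datetime import datetime as dt
    from elasticsearch import Elasticsearch

    es = Elasticsearch(['http://localhost:9200'])  # assumed local node

    # First pass (operators/__init__.py, process): index a stub document as
    # soon as an onion is discovered; Elasticsearch returns the generated _id.
    stub = {
        'hiddenService': 'abcdefexample.onion',  # hypothetical onion address
        'monitor': 'false',
        'dateAdded': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z',
    }
    saved = es.index(index='onions', body=stub)

    # Second pass (operators/html.py, handle_onion): enrich the same document
    # in place by _id once the crawl succeeds, instead of indexing a second copy.
    crawl_result = {'simple-html': {'status': 'success', 'title': 'Example'}}
    es.update(index='onions', id=saved['_id'], body={'doc': crawl_result})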