fixed db workflow

pull/1/head
danieleperera 4 years ago
parent e31f149af0
commit a6deba300b

@@ -73,23 +73,19 @@ class Ingestor:
# Run the source to collect artifacts.
self.logger.info(f"Running source '{source}'")
try:
# get the generator of onions
onions = self.sources[source].run()
if onions:
self.logger.info('Found hidden links')
else:
self.logger.info('No links found')
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.format_exc())
continue
# Process artifacts with each operator.
# Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
try:
doc = self.operators[operator].process(onions)
# Save the source state.
self.es.save(doc)
self.operators[operator].process(onions)
# Save the source onion with collected data
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.format_exc())
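
Note on this hunk: if `run()` returns a generator (the sources here yield onions), `if onions:` is always true, because generator objects are truthy even when they will yield nothing. A hedged sketch of a peek-based emptiness check (the helper name is mine, not the repo's):

import itertools

def peek(gen):
    # Returns (first_item, generator_with_item_restored),
    # or (None, empty_iterator) if the generator is already exhausted.
    try:
        first = next(gen)
    except StopIteration:
        return None, iter(())
    return first, itertools.chain([first], gen)

first, onions = peek(iter([]))
print(first is None)  # True: the source yielded nothing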

@@ -20,7 +20,8 @@ class DbHandlerElasticSearch:
"type": "keyword"
},
"monitor": {
"type": "boolean"
"type": "boolean",
"null_value": "false"
},
"simple-html": {
"type": "nested",
@@ -39,6 +40,9 @@ class DbHandlerElasticSearch:
},
"date-indexed": {
"type": "date"
},
"interestingKeywords":{
"type": "keyword"
}
}
}
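
For reference, a minimal sketch of how a mapping like this is applied with the elasticsearch-py client (the index name 'onions' is an assumption, not taken from this diff):

from elasticsearch import Elasticsearch

es = Elasticsearch()
mapping = {
    'mappings': {
        'properties': {
            # Assumed subset of the mapping shown above.
            'monitor': {'type': 'boolean', 'null_value': False},
            'interestingKeywords': {'type': 'keyword'},
            'date-indexed': {'type': 'date'},
        }
    }
}
# Create the index with the mapping only if it does not exist yet.
if not es.indices.exists(index='onions'):
    es.indices.create(index='onions', body=mapping)
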
@@ -65,11 +69,22 @@ class DbHandlerElasticSearch:
self.es.indices.refresh(self.index)
status = self.es.count(index=self.index)
if status['_shards']['successful'] == 1:
self.logger.info('Successful')
self.logger.info('Count:%d',status['count'])
self.logger.info('Successfully indexed item on Elasticsearch')
self.logger.info('Current item count: %d', status['count'])
else:
self.logger.error(status)
def save(self, doc):
self.es.index(index=self.index,body=doc)
self.count()
def update(self, _id, data):
if _id and data:
self.es.update(
index=self.index,
id=_id,
body={"doc":data})
self.count()
def save(self, data):
if data:
status = self.es.index(index=self.index,body=data)
self.count()
return status
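
The reworked `save()` now returns the raw `es.index()` response, and its `_id` is exactly what the new `update()` needs. A minimal usage sketch against the underlying client (index name assumed):

from elasticsearch import Elasticsearch

es = Elasticsearch()
resp = es.index(index='onions', body={'hiddenService': 'example.onion'})
print(resp['_id'], resp['result'])  # e.g. 'fWq3...', 'created'
# A partial update merges the given fields into the stored document.
es.update(index='onions', id=resp['_id'], body={'doc': {'monitor': 'true'}})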

@@ -1,7 +1,7 @@
import re
import sys
import json
from datetime import datetime as dt
class Operator:
"""Base class for all Operator plugins.
@@ -51,16 +51,13 @@ class Operator:
return: dict
"""
try:
return {operator_name: json.loads(str(content)), 'hiddenService': onion}
except json.decoder.JSONDecodeError as e:
self.logger.info('JSONDecode Error')
return {operator_name: content, 'hiddenService': onion}
#except TypeError:
# return {operator_name: None, 'hiddenService': onion}
except Exception as e:
self.logger.error(e)
def handle_onion(self, url):
def handle_onion(self, db, url):
"""Override with the same signature.
:param db: Index response of the saved stub document (carries ``_id``).
:param url: A single onion url.
@@ -69,18 +66,17 @@ class Operator:
raise NotImplementedError()
def _onion_is_allowed(self, response, type='URL'):
def _onion_is_allowed(self, response, db, type='URL'):
"""Returns True if this is allowed by this plugin's filters."""
# Must not match the blacklist patterns, if set.
if type == 'URL':
self.logger.debug(response)
blacklist = self.blacklist.findall(response['hiddenService'])
elif type == 'HTML':
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
blacklist = self.blacklist.findall(response['simple-html']['HTML'])
if blacklist:
self.es.save(response)
self.es.update(db['_id'], response)
return False
return True
@@ -88,8 +84,15 @@ class Operator:
def process(self, onions):
"""Process all applicable onions."""
for onion in onions:
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
'monitor':'false',
'dateAdded':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'})
if self._onion_is_allowed(
self.response({'status':'blocked'},onion.url,'regex-blacklist'),
db,
type='URL'):
self.handle_onion(onion.url)
# Get data for current link
self.handle_onion(db, onion.url)
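
The net effect of this hunk: each discovered link is persisted as a stub document before any crawling happens, so a blocked or failed crawl can never lose the link; later stages enrich the same document by `_id`. A self-contained sketch of that workflow with an in-memory stand-in for the handler (class and fields here are illustrative only):

from datetime import datetime as dt

class MemoryDB:
    """Toy stand-in for DbHandlerElasticSearch: save() returns {'_id': ...}."""
    def __init__(self):
        self.docs = {}
    def save(self, data):
        _id = str(len(self.docs))
        self.docs[_id] = dict(data)
        return {'_id': _id}
    def update(self, _id, data):
        self.docs[_id].update(data)

db = MemoryDB()
stub = db.save({'hiddenService': 'example.onion',
                'monitor': 'false',
                'dateAdded': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'})
# ...crawl happens here; on success the same document is enriched in place:
db.update(stub['_id'], {'simple-html': {'status': 'success'}})
print(db.docs[stub['_id']]['simple-html'])  # {'status': 'success'}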

@@ -1,3 +1,4 @@
import re
import time
import json
import traceback
@@ -29,6 +30,9 @@ class Plugin(Operator):
self.timeout = int(kwargs['timeout'])
self.retries = int(kwargs['retries'])
interesting = kwargs['interestingKeywords'].split(',')
self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE)
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.headers ={
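
The `interestingKeywords` setting is compiled into a single alternation regex: `re.escape` neutralizes any regex metacharacters in the configured words, and `re.IGNORECASE` matches any casing. A quick illustration (sample keywords are made up):

import re

interesting = 'bitcoin,password,dump'.split(',')
pattern = re.compile('|'.join(re.escape(word) for word in interesting), re.IGNORECASE)

html = '<p>Bitcoin wallet DUMP, bitcoin mixer</p>'
# list(set(...)) mirrors the deduplication done before indexing; note that
# findall() preserves the original casing, so 'Bitcoin' and 'bitcoin' both survive.
print(sorted(set(pattern.findall(html))))  # ['Bitcoin', 'DUMP', 'bitcoin']
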
@@ -74,7 +78,24 @@ class Plugin(Operator):
result = content.text
if result:
html = BeautifulSoup(result,features="lxml")
index = {'HTML':result,'title':html.title.text,'language':detect(html.text),'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z','status':'success'}
if html:
index = {
'HTML':result,
'title':html.title.text,
'language':detect(html.text),
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
else:
index = {
'HTML':result,
'title': None,
'language': None,
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
return self.response(index, onion, self.plugin_name)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f'Failed connecting to http://{url}')
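
One caveat in the new branch: a `BeautifulSoup` object is truthy for any non-empty document, so `if html:` does not protect `html.title.text` when the page simply has no `<title>` tag. A more defensive sketch:

from bs4 import BeautifulSoup

def extract_title(result):
    html = BeautifulSoup(result, features='lxml')
    # html.title is None when the page lacks a <title>; guard that, not the soup.
    return html.title.text if html.title else None

print(extract_title('<html><body>no title here</body></html>'))  # None
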
@@ -90,10 +111,8 @@ class Plugin(Operator):
self.logger.error('[x] Max retries exceeded')
return self.response({'status':"failure"}, onion, self.plugin_name)
def handle_onion(self, onion):
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
print(content)
if content[self.plugin_name]['status'] == 'success':
if self._onion_is_allowed(content):
self.es.save(content)
if self._onion_is_allowed(content, db, type='HTML'):
self.es.update(db['_id'], content)

@@ -1,10 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__version__ = "1.0.0"
__maintainer__ = "Daniele Perera"
__status__ = "Development"
import requests
@@ -28,4 +28,5 @@ class Plugin(Source):
lines = fp.read().splitlines()
for onion in lines:
yield self.onion(url=onion,source='simple-file',type='domain')
os.remove(self.filename)
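
Because `run()` is a generator, the trailing `os.remove(self.filename)` only runs once a consumer has drained every yielded line; if iteration stops early, the file is never deleted. A small demonstration of that behavior:

import os
import tempfile

def lines_then_cleanup(path):
    with open(path) as fp:
        for line in fp.read().splitlines():
            yield line
    os.remove(path)  # reached only after the last yield

fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'w') as f:
    f.write('a.onion\nb.onion\n')

gen = lines_then_cleanup(path)
next(gen)                    # generator paused at the first yield...
print(os.path.exists(path))  # True: cleanup has not run yet
list(gen)                    # drain the rest; cleanup executes
print(os.path.exists(path))  # False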

@@ -0,0 +1,90 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"
import requests
import json
import re
import logging
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup
class TORCH:
def __init__(self,
port_proxy=None,
type_proxy=None,
server_proxy=None,
terms=None,
timeout=None):
self.desktop_agents = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
]
self.url = 'http://xmh57jrzrnw6insl.onion'
self.logger = logging.getLogger('Class:TORCH')
self.session = requests.session()
self.terms = terms
self.timeout = timeout
self.proxies = {
"http": f"{type_proxy}://{server_proxy}:{port_proxy}",
}
# Select a random user agent from the list.
@property
def random_headers(self):
return {
'User-Agent': choice(self.desktop_agents),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
@property
def start(self):
self.headers = self.random_headers
self.logger.info(f'Connecting to {self.url}')
urls = []
self.logger.info('Generating URLs')
for term in self.terms:
urls.append(
f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
cont = 0
while cont <= 9:
cont += 1
urls.append(
f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&np={cont}&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
onionurls = []
for url in urls:
self.logger.debug(f'Connecting to {url}')
try:
request = self.session.get(
url, proxies=self.proxies, timeout=self.timeout)
if request.status_code == 200:
soup = BeautifulSoup(request.content, features="lxml")
for findurl in soup.find_all('dt'):
onionurls.append(findurl.find('a')['href'].replace('\xad', '')
.replace('\n', '')
.replace("http://", '')
.replace("https://", '')
.replace(r'\s', '')
.replace('\t', ''))
except(requests.exceptions.ConnectionError,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ReadTimeout,
requests.exceptions.InvalidURL) as e:
self.logger.error(
f'Could not connect to the url because an error occurred.\n{e}')
return onionurls
if __name__ == '__main__':
# Example proxy values only; adjust to the local Tor SOCKS listener.
app = TORCH(port_proxy=9050, type_proxy='socks5h', server_proxy='localhost', terms=['example'], timeout=60)
app.start