fixed db workflow

pull/1/head
danieleperera 4 years ago
parent e31f149af0
commit a6deba300b

@@ -73,23 +73,19 @@ class Ingestor:
# Run the source to collect artifacts.
self.logger.info(f"Running source '{source}'")
try:
# get the generator of onions
onions = self.sources[source].run()
if onions:
self.logger.info('Found hidden links')
else:
self.logger.info('No links found')
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.format_exc())
continue
# Process artifacts with each operator.
# Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
try:
doc = self.operators[operator].process(onions)
# Save the source state.
self.es.save(doc)
self.operators[operator].process(onions)
# Save the source onion with collected data
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.format_exc())
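
Note on this hunk: if `run()` returns a generator (the sources here yield onions), `if onions:` is always true, because generator objects are truthy even when they will yield nothing. A hedged sketch of a peek-based emptiness check (the helper name is mine, not the repo's):

import itertools

def peek(gen):
    # Returns (first_item, generator_with_item_restored),
    # or (None, empty_iterator) if the generator is already exhausted.
    try:
        first = next(gen)
    except StopIteration:
        return None, iter(())
    return first, itertools.chain([first], gen)

first, onions = peek(iter([]))
print(first is None)  # True: the source yielded nothing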

@@ -20,7 +20,8 @@ class DbHandlerElasticSearch:
"type": "keyword"
},
"monitor": {
"type": "boolean"
"type": "boolean",
"null_value": "false"
},
"simple-html": {
"type": "nested",
@@ -39,6 +40,9 @@ class DbHandlerElasticSearch:
},
"date-indexed": {
"type": "date"
},
"interestingKeywords":{
"type": "keyword"
}
}
}
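
For reference, a minimal sketch of how a mapping like this is applied with the elasticsearch-py client (the index name 'onions' is an assumption, not taken from this diff):

from elasticsearch import Elasticsearch

es = Elasticsearch()
mapping = {
    'mappings': {
        'properties': {
            # Assumed subset of the mapping shown above.
            'monitor': {'type': 'boolean', 'null_value': False},
            'interestingKeywords': {'type': 'keyword'},
            'date-indexed': {'type': 'date'},
        }
    }
}
# Create the index with the mapping only if it does not exist yet.
if not es.indices.exists(index='onions'):
    es.indices.create(index='onions', body=mapping)
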
@@ -65,11 +69,22 @@ class DbHandlerElasticSearch:
self.es.indices.refresh(self.index)
status = self.es.count(index=self.index)
if status['_shards']['successful'] == 1:
self.logger.info('Successful')
self.logger.info('Count:%d',status['count'])
self.logger.info('Successfully indexed item on Elasticsearch')
self.logger.info('Current item count: %d', status['count'])
else:
self.logger.error(status)
def save(self, doc):
self.es.index(index=self.index,body=doc)
self.count()
def update(self, _id, data):
if _id and data:
self.es.update(
index=self.index,
id=_id,
body={"doc":data})
self.count()
def save(self, data):
if data:
status = self.es.index(index=self.index,body=data)
self.count()
return status
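
The reworked `save()` now returns the raw `es.index()` response, and its `_id` is exactly what the new `update()` needs. A minimal usage sketch against the underlying client (index name assumed):

from elasticsearch import Elasticsearch

es = Elasticsearch()
resp = es.index(index='onions', body={'hiddenService': 'example.onion'})
print(resp['_id'], resp['result'])  # e.g. 'fWq3...', 'created'
# A partial update merges the given fields into the stored document.
es.update(index='onions', id=resp['_id'], body={'doc': {'monitor': 'true'}})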

@@ -1,7 +1,7 @@
import re
import sys
import json
from datetime import datetime as dt
class Operator:
"""Base class for all Operator plugins.
@@ -51,16 +51,13 @@ class Operator:
return: dict
"""
try:
return {operator_name: json.loads(str(content)), 'hiddenService': onion}
except json.decoder.JSONDecodeError as e:
self.logger.info('JSONDecode Error')
return {operator_name: content, 'hiddenService': onion}
#except TypeError:
# return {operator_name: None, 'hiddenService': onion}
except Exception as e:
self.logger.error(e)
def handle_onion(self, url):
def handle_onion(self, db, url):
"""Override with the same signature.
:param db: Index response of the saved stub document (carries ``_id``).
:param url: A single onion url.
@@ -69,18 +66,17 @@ class Operator:
raise NotImplementedError()
def _onion_is_allowed(self, response, type='URL'):
def _onion_is_allowed(self, response, db, type='URL'):
"""Returns True if this is allowed by this plugin's filters."""
# Must not match the blacklist patterns, if set.
if type == 'URL':
self.logger.debug(response)
blacklist = self.blacklist.findall(response['hiddenService'])
elif type == 'HTML':
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
blacklist = self.blacklist.findall(response['simple-html']['HTML'])
if blacklist:
self.es.save(response)
self.es.update(db['_id'], response)
return False
return True
@@ -88,8 +84,15 @@ class Operator:
def process(self, onions):
"""Process all applicable onions."""
for onion in onions:
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
'monitor':'false',
'dateAdded':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'})
if self._onion_is_allowed(
self.response({'status':'blocked'},onion.url,'regex-blacklist'),
db,
type='URL'):
self.handle_onion(onion.url)
# Get data for current link
self.handle_onion(db, onion.url)
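
The net effect of this hunk: each discovered link is persisted as a stub document before any crawling happens, so a blocked or failed crawl can never lose the link; later stages enrich the same document by `_id`. A self-contained sketch of that workflow with an in-memory stand-in for the handler (class and fields here are illustrative only):

from datetime import datetime as dt

class MemoryDB:
    """Toy stand-in for DbHandlerElasticSearch: save() returns {'_id': ...}."""
    def __init__(self):
        self.docs = {}
    def save(self, data):
        _id = str(len(self.docs))
        self.docs[_id] = dict(data)
        return {'_id': _id}
    def update(self, _id, data):
        self.docs[_id].update(data)

db = MemoryDB()
stub = db.save({'hiddenService': 'example.onion',
                'monitor': 'false',
                'dateAdded': dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'})
# ...crawl happens here; on success the same document is enriched in place:
db.update(stub['_id'], {'simple-html': {'status': 'success'}})
print(db.docs[stub['_id']]['simple-html'])  # {'status': 'success'}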

@@ -1,3 +1,4 @@
import re
import time
import json
import traceback
@@ -29,6 +30,9 @@ class Plugin(Operator):
self.timeout = int(kwargs['timeout'])
self.retries = int(kwargs['retries'])
interesting = kwargs['interestingKeywords'].split(',')
self.interesting = re.compile('|'.join([re.escape(word) for word in interesting]), re.IGNORECASE)
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.headers ={
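
The `interestingKeywords` setting is compiled into a single alternation regex: `re.escape` neutralizes any regex metacharacters in the configured words, and `re.IGNORECASE` matches any casing. A quick illustration (sample keywords are made up):

import re

interesting = 'bitcoin,password,dump'.split(',')
pattern = re.compile('|'.join(re.escape(word) for word in interesting), re.IGNORECASE)

html = '<p>Bitcoin wallet DUMP, bitcoin mixer</p>'
# list(set(...)) mirrors the deduplication done before indexing; note that
# findall() preserves the original casing, so 'Bitcoin' and 'bitcoin' both survive.
print(sorted(set(pattern.findall(html))))  # ['Bitcoin', 'DUMP', 'bitcoin']
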
@@ -74,7 +78,24 @@ class Plugin(Operator):
result = content.text
if result:
html = BeautifulSoup(result,features="lxml")
index = {'HTML':result,'title':html.title.text,'language':detect(html.text),'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z','status':'success'}
if html:
index = {
'HTML':result,
'title':html.title.text,
'language':detect(html.text),
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
else:
index = {
'HTML':result,
'title': None,
'language': None,
'date-crawled':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z',
'status':'success',
'interestingKeywords':list(set(self.interesting.findall(result)))
}
return self.response(index, onion, self.plugin_name)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f'Failed connecting to http://{url}')
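
One caveat in the new branch: a `BeautifulSoup` object is truthy for any non-empty document, so `if html:` does not protect `html.title.text` when the page simply has no `<title>` tag. A more defensive sketch:

from bs4 import BeautifulSoup

def extract_title(result):
    html = BeautifulSoup(result, features='lxml')
    # html.title is None when the page lacks a <title>; guard that, not the soup.
    return html.title.text if html.title else None

print(extract_title('<html><body>no title here</body></html>'))  # None
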
@@ -90,10 +111,8 @@ class Plugin(Operator):
self.logger.error('[x] Max retries exceeded')
return self.response({'status':"failure"}, onion, self.plugin_name)
def handle_onion(self, onion):
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
print(content)
if content[self.plugin_name]['status'] == 'success':
if self._onion_is_allowed(content):
self.es.save(content)
if self._onion_is_allowed(content, db, type='HTML'):
self.es.update(db['_id'], content)

@@ -1,10 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__version__ = "1.0.0"
__maintainer__ = "Daniele Perera"
__status__ = "Development"
import requests
@@ -28,4 +28,5 @@ class Plugin(Source):
lines = fp.read().splitlines()
for onion in lines:
yield self.onion(url=onion,source='simple-file',type='domain')
os.remove(self.filename)
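
Because `run()` is a generator, the trailing `os.remove(self.filename)` only runs once a consumer has drained every yielded line; if iteration stops early, the file is never deleted. A small demonstration of that behavior:

import os
import tempfile

def lines_then_cleanup(path):
    with open(path) as fp:
        for line in fp.read().splitlines():
            yield line
    os.remove(path)  # reached only after the last yield

fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'w') as f:
    f.write('a.onion\nb.onion\n')

gen = lines_then_cleanup(path)
next(gen)                    # generator paused at the first yield...
print(os.path.exists(path))  # True: cleanup has not run yet
list(gen)                    # drain the rest; cleanup executes
print(os.path.exists(path))  # False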

@@ -0,0 +1,90 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"
import requests
import json
import re
import logging
import urllib.parse
from random import choice
import time
from bs4 import BeautifulSoup
class TORCH:
def __init__(self,
port_proxy=None,
type_proxy=None,
server_proxy=None,
terms=None,
timeout=None):
self.desktop_agents = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0'
]
self.url = 'http://xmh57jrzrnw6insl.onion'
self.logger = logging.getLogger('Class:TORCH')
self.session = requests.session()
self.terms = terms
self.timeout = timeout
self.proxies = {
"http": f"{type_proxy}://{server_proxy}:{port_proxy}",
}
# Select a random user agent from the list.
@property
def random_headers(self):
return {
'User-Agent': choice(self.desktop_agents),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
@property
def start(self):
self.headers = self.random_headers
self.logger.info(f'Connecting to {self.url}')
urls = []
self.logger.info('Generating URLs')
for term in self.terms:
urls.append(
f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
cont = 0
while cont <= 9:
cont += 1
urls.append(
f"{self.url}/4a1f6b371c/search.cgi?cmd=Search!&fmt=url&form=extended&GroupBySite=no&m=all&np={cont}&ps=50&q={term}&sp=1&sy=1&type=&ul=&wf=2221&wm=wrd")
onionurls = []
for url in urls:
self.logger.debug(f'Connecting to {url}')
try:
request = self.session.get(
url, proxies=self.proxies, timeout=self.timeout)
if request.status_code == 200:
soup = BeautifulSoup(request.content, features="lxml")
for findurl in soup.find_all('dt'):
onionurls.append(findurl.find('a')['href'].replace('\xad', '')
.replace('\n', '')
.replace("http://", '')
.replace("https://", '')
.replace(r'\s', '')
.replace('\t', ''))
except(requests.exceptions.ConnectionError,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ReadTimeout,
requests.exceptions.InvalidURL) as e:
self.logger.error(
f'Could not connect to the url because an error occurred.\n{e}')
return onionurls
if __name__ == '__main__':
# Example proxy values only; adjust to the local Tor SOCKS listener.
app = TORCH(port_proxy=9050, type_proxy='socks5h', server_proxy='localhost', terms=['example'], timeout=60)
app.start