diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py
index 061dcdb..7bb083a 100644
--- a/onioningestor/operators/__init__.py
+++ b/onioningestor/operators/__init__.py
@@ -1,7 +1,10 @@
 import re
 import sys
 import json
+from itertools import islice
 from datetime import datetime as dt
+from concurrent.futures import ThreadPoolExecutor
+
 
 class Operator:
     """Base class for all Operator plugins.
@@ -72,18 +75,18 @@ class Operator:
 
         if type == 'URL':
             blacklist = self.blacklist.findall(response['hiddenService'])
         elif type == 'HTML':
-            response['simple-html'].pop('status')
-            response['simple-html']['status'] = 'blocked'
             blacklist = self.blacklist.findall(response['simple-html']['HTML'])
         if blacklist:
+            response['simple-html'].pop('status')
+            response['simple-html']['status'] = 'blocked'
+            response['blacklist'] = list(set(blacklist))
             self.es.update(db['_id'], response)
             return False
         return True
-
-    def process(self, onions):
-        """Process all applicable onions."""
+    def collect(self, onions):
         for onion in onions:
+            self.logger.info(f'thread function processing {onion}')
             # Add link to database
             db = self.es.save({
                 'hiddenService':onion.url,
@@ -96,3 +99,18 @@ class Operator:
 
             # Get data for current link
             self.handle_onion(db, onion.url)
+    def iter_batches(self, data, batch_size):
+        data = iter(data)
+        while True:
+            batch = list(islice(data, batch_size))
+            if len(batch) == 0:
+                break
+            yield batch
+
+    def process(self, onions):
+        """Process all applicable onions."""
+        #print(onions)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
+            for tasks in collect_tasks:
+                self.logger.info(tasks.result())
diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py
index d0a2064..8ccf821 100644
--- a/onioningestor/operators/html.py
+++ b/onioningestor/operators/html.py
@@ -78,6 +78,10 @@ class Plugin(Operator):
             result = content.text
             if result:
                 html = BeautifulSoup(result,features="lxml")
+                # testing hardcoded filepath
+                with open("/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", "w") as fp:
+                    for onion in re.findall(r'([a-z2-7]{16,56}\.onion)', result):
+                        fp.write("%s\n" % onion)
                 if html:
                     index = {
                         'HTML':result,
@@ -114,5 +118,5 @@ class Plugin(Operator):
     def handle_onion(self, db, onion):
         content = self.run_sessions(onion)
         if content[self.plugin_name]['status'] == 'success':
-            if self._onion_is_allowed(db, content):
+            if self._onion_is_allowed(content, db, 'HTML'):
                 self.es.update(db['_id'], content)
diff --git a/onioningestor/sources/simplefile.py b/onioningestor/sources/simplefile.py
index f679aeb..64c084f 100644
--- a/onioningestor/sources/simplefile.py
+++ b/onioningestor/sources/simplefile.py
@@ -7,6 +7,7 @@
 __version__ = "1.0.0"
 __maintainer__ = "Daniele Perera"
 __status__ = "Development"
 
+import os
 import requests
 from pathlib import Path
@@ -23,10 +24,14 @@ class Plugin(Source):
 
     def run(self):
+        items = []
         filepath = Path(__file__).parents[2]/self.filename
         with open(filepath, 'r') as fp:
             lines = fp.read().splitlines()
-        for onion in lines:
-            yield self.onion(url=onion,source='simple-file',type='domain')
+        # just testing os.remove(self.filename)
+        for onion in lines:
+            items.append(self.onion(url=onion,source='simple-file',type='domain'))
+            #yield self.onion(url=onion,source='simple-file',type='domain')
+        return items
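
Note: the core change above splits Operator.process() into iter_batches() (lazy batching via islice) plus a ThreadPoolExecutor that runs collect() on each batch. Below is a minimal, self-contained sketch of that same pattern, assuming a placeholder worker: collect() here just echoes its batch back, standing in for the real method that saves each onion to Elasticsearch and calls handle_onion(); the input list is hypothetical.

    from itertools import islice
    from concurrent.futures import ThreadPoolExecutor

    def iter_batches(data, batch_size):
        # Lazily slice any iterable into lists of at most batch_size items.
        data = iter(data)
        while True:
            batch = list(islice(data, batch_size))
            if not batch:
                break
            yield batch

    def collect(batch):
        # Placeholder worker: the real collect() persists each onion and
        # scrapes it; here we only echo the batch back for demonstration.
        return [f"processed {item}" for item in batch]

    if __name__ == "__main__":
        onions = [f"example{i}.onion" for i in range(25)]  # hypothetical input
        with ThreadPoolExecutor(max_workers=10) as executor:
            tasks = [executor.submit(collect, batch)
                     for batch in iter_batches(onions, batch_size=10)]
            for task in tasks:
                print(task.result())  # batches of 10, 10, and 5 items

With 25 inputs this submits three futures, one per batch; each result() call blocks until that batch finishes, which mirrors the self.logger.info(tasks.result()) loop in the diff.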