OnionIngestor/onioningestor/sources/reddit.py


#!/usr/bin/python3
# -*- coding: utf-8 -*-
__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"

import json
import re
from random import choice

import requests
from bs4 import BeautifulSoup

class Reddit:
    def __init__(self):
        self.session = requests.session()
        self.source = 'Reddit'
        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        # Pick a random desktop User-Agent so requests look less scripted.
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    @property
    def start(self):
        return self.reddit_json()

    def reddit_json(self):
        """Query the Pushshift API for r/onions comments and harvest .onion URLs."""
        print('Getting Reddit API information')
        onionurl = []
        # Optional short subdomain, optional dot, then a 12-50 character onion name.
        regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
        try:
            request = self.session.get(self.url, headers=self.random_headers)
            loaded_json = json.loads(request.content)
            print('Filtering the URLs that contain the word .onion in the text')
            for data in loaded_json['data']:
                reddit_url = 'https://www.reddit.com{}'.format(data['permalink'])
                try:
                    request = self.session.get(reddit_url, headers=self.random_headers)
                    soup = BeautifulSoup(request.content, features="lxml")
                    # Visit every outbound link in the comment thread and scan its text.
                    for raw in soup.findAll('a', {'rel': 'nofollow'}):
                        if 'https://' in raw['href']:
                            raw_text = self.raw(url=raw['href'])
                            if raw_text is not None:
                                print('Applying regex. Wait...')
                                for line in raw_text.split('\n'):
                                    # Strip soft hyphens, URL schemes, and all
                                    # whitespace before matching.
                                    cleaned = re.sub(r'\s', '', line
                                                     .replace('\xad', '')
                                                     .replace('http://', '')
                                                     .replace('https://', ''))
                                    xurl = regex.match(cleaned)
                                    if xurl is not None:
                                        onionurl.append(xurl.group())
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.ChunkedEncodingError,
                        requests.exceptions.ReadTimeout,
                        requests.exceptions.InvalidURL) as e:
                    print('Could not connect to the URL because an error occurred.\n{e}'.format(e=e))
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            print('Could not connect to the URL because an error occurred.\n{e}'.format(e=e))
        return onionurl

    def raw(self, url):
        """Fetch a page and return its visible text, or None on failure."""
        try:
            if url is not None:
                request = self.session.get(url, headers=self.random_headers)
                print('Connecting to {url} - {status}'.format(url=url, status=request.status_code))
                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    # Drop script and style nodes so only human-readable text remains.
                    for s in soup(['script', 'style']):
                        s.decompose()
                    return ' '.join(soup.stripped_strings)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects):
            # Network failures are treated as "no text available".
            return None

if __name__ == '__main__':
    app = Reddit()
    print(app.start)
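
# A minimal, self-contained sketch of how the extraction regex above behaves
# in isolation, assuming the same pattern used in reddit_json. The sample
# strings are hypothetical and only illustrate which input shapes the
# pattern accepts.
if __name__ == '__main__':
    demo_regex = re.compile(r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")
    samples = [
        'expyuzz4wqqyqhjn.onion',        # 16-character v2-style name: matches
        'forum.abcdefghijklmnop.onion',  # subdomain plus name: matches
        'short.onion',                   # name shorter than 12 characters: no match
    ]
    for sample in samples:
        print(sample, '->', bool(demo_regex.match(sample)))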