Browse Source

initial

master
spike 9 months ago
commit
c68c0e138f
18 changed files with 675 additions and 0 deletions
  1. +3
    -0
      .gitignore
  2. +20
    -0
      LICENSE.txt
  3. +23
    -0
      Pipfile
  4. +210
    -0
      Pipfile.lock
  5. +53
    -0
      README.md
  6. BIN
     
  7. BIN
     
  8. BIN
     
  9. +18
    -0
      database_connection.py
  10. +63
    -0
      download_images.py
  11. +80
    -0
      export_messages.py
  12. +86
    -0
      import_messages.py
  13. +23
    -0
      list_rooms.py
  14. +36
    -0
      matrix_connection.py
  15. +25
    -0
      schema.py
  16. +3
    -0
      setup.cfg
  17. +19
    -0
      templates/default.html.tpl
  18. +13
    -0
      templates/default.txt.tpl

+ 3
- 0
.gitignore View File

@ -0,0 +1,3 @@
thumbnails/
images/
messages.html

+ 20
- 0
LICENSE.txt View File

@ -0,0 +1,20 @@
MIT License
Copyright (c) 2018 Oliver Steele
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 23
- 0
Pipfile View File

@ -0,0 +1,23 @@
[[source]]
name = "pypi"
url = "https://pypi.python.org/simple"
verify_ssl = true
[requires]
python_full_version = "3.6.5"
[scripts]
import = "python import_messages.py"
export = "python export_messages.py"
list = "python list_rooms.py"
[packages]
matrix_client = "*"
mongoengine = "*"
click = "*"
tabulate = "*"
pyyaml = "*"
"jinja2" = "*"
requests = "*"
[dev-packages]

+ 210
- 0
Pipfile.lock View File

@ -0,0 +1,210 @@
{
"_meta": {
"hash": {
"sha256": "cc3f8517f705a63c6a297c6c461a500fb75e28dc81050219498b261f1ca157b1"
},
"pipfile-spec": 6,
"requires": {
"python_full_version": "3.6.5"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
}
]
},
"default": {
"certifi": {
"hashes": [
"sha256:5ad7e9a056d25ffa5082862e36f119f7f7cec6457fa07ee2f8c339814b80c9b1",
"sha256:9cd41137dc19af6a5e03b630eefe7d1f458d964d406342dd3edf625839b944cc"
],
"version": "==2020.4.5.2"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"version": "==3.0.4"
},
"click": {
"hashes": [
"sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a",
"sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"
],
"index": "pypi",
"version": "==7.1.2"
},
"idna": {
"hashes": [
"sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
"sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
],
"version": "==2.9"
},
"jinja2": {
"hashes": [
"sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0",
"sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"
],
"index": "pypi",
"version": "==2.11.2"
},
"markupsafe": {
"hashes": [
"sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161",
"sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235",
"sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5",
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42",
"sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff",
"sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b",
"sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1",
"sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e",
"sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183",
"sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66",
"sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b",
"sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1",
"sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15",
"sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1",
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e",
"sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b",
"sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905",
"sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735",
"sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d",
"sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e",
"sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d",
"sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c",
"sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21",
"sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2",
"sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5",
"sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b",
"sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6",
"sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f",
"sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f",
"sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2",
"sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7",
"sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"
],
"version": "==1.1.1"
},
"matrix-client": {
"hashes": [
"sha256:2855a2614a177db66f9bc3ba38cbd2876041456f663c334f72a160ab6bb11c49",
"sha256:dce3ccb8665df0d519f08e07a16e6d3f9fab3a947df4b7a7c4bb26573d68f2d5"
],
"index": "pypi",
"version": "==0.3.2"
},
"mongoengine": {
"hashes": [
"sha256:6e127f45f71c2bc5e72461ec297a0c20f04c3ee0bf6dd869e336226e325db6ef",
"sha256:db9e5d587e5d74e52851e0e4a53fd744725bfa9918ae6070139f5ba9c62c6edf"
],
"index": "pypi",
"version": "==0.20.0"
},
"pymongo": {
"hashes": [
"sha256:01b4e10027aef5bb9ecefbc26f5df3368ce34aef81df43850f701e716e3fe16d",
"sha256:0fc5aa1b1acf7f61af46fe0414e6a4d0c234b339db4c03a63da48599acf1cbfc",
"sha256:1396eb7151e0558b1f817e4b9d7697d5599e5c40d839a9f7270bd90af994ad82",
"sha256:18e84a3ec5e73adcb4187b8e5541b2ad61d716026ed9863267e650300d8bea33",
"sha256:19adf2848b80cb349b9891cc854581bbf24c338be9a3260e73159bdeb2264464",
"sha256:20ee0475aa2ba437b0a14806f125d696f90a8433d820fb558fdd6f052acde103",
"sha256:26798795097bdeb571f13942beef7e0b60125397811c75b7aa9214d89880dd1d",
"sha256:26e707a4eb851ec27bb969b5f1413b9b2eac28fe34271fa72329100317ea7c73",
"sha256:2a3c7ad01553b27ec553688a1e6445e7f40355fb37d925c11fcb50b504e367f8",
"sha256:2f07b27dbf303ea53f4147a7922ce91a26b34a0011131471d8aaf73151fdee9a",
"sha256:316f0cf543013d0c085e15a2c8abe0db70f93c9722c0f99b6f3318ff69477d70",
"sha256:31d11a600eea0c60de22c8bdcb58cda63c762891facdcb74248c36713240987f",
"sha256:334ef3ffd0df87ea83a0054454336159f8ad9c1b389e19c0032d9cb8410660e6",
"sha256:358ba4693c01022d507b96a980ded855a32dbdccc3c9331d0667be5e967f30ed",
"sha256:3a6568bc53103df260f5c7d2da36dffc5202b9a36c85540bba1836a774943794",
"sha256:444bf2f44264578c4085bb04493bfed0e5c1b4fe7c2704504d769f955cc78fe4",
"sha256:47a00b22c52ee59dffc2aad02d0bbfb20c26ec5b8de8900492bf13ad6901cf35",
"sha256:4c067db43b331fc709080d441cb2e157114fec60749667d12186cc3fc8e7a951",
"sha256:4c092310f804a5d45a1bcaa4191d6d016c457b6ed3982a622c35f729ff1c7f6b",
"sha256:53b711b33134e292ef8499835a3df10909c58df53a2a0308f598c432e9a62892",
"sha256:568d6bee70652d8a5af1cd3eec48b4ca1696fb1773b80719ebbd2925b72cb8f6",
"sha256:56fa55032782b7f8e0bf6956420d11e2d4e9860598dfe9c504edec53af0fc372",
"sha256:5a2c492680c61b440272341294172fa3b3751797b1ab983533a770e4fb0a67ac",
"sha256:61235cc39b5b2f593086d1d38f3fc130b2d125bd8fc8621d35bc5b6bdeb92bd2",
"sha256:619ac9aaf681434b4d4718d1b31aa2f0fce64f2b3f8435688fcbdc0c818b6c54",
"sha256:6238ac1f483494011abde5286282afdfacd8926659e222ba9b74c67008d3a58c",
"sha256:63752a72ca4d4e1386278bd43d14232f51718b409e7ac86bcf8810826b531113",
"sha256:6fdc5ccb43864065d40dd838437952e9e3da9821b7eac605ba46ada77f846bdf",
"sha256:7abc3a6825a346fa4621a6f63e3b662bbb9e0f6ffc32d30a459d695f20fb1a8b",
"sha256:7aef381bb9ae8a3821abd7f9d4d93978dbd99072b48522e181baeffcd95b56ae",
"sha256:80df3caf251fe61a3f0c9614adc6e2bfcffd1cd3345280896766712fb4b4d6d7",
"sha256:95f970f34b59987dee6f360d2e7d30e181d58957b85dff929eee4423739bd151",
"sha256:993257f6ca3cde55332af1f62af3e04ca89ce63c08b56a387cdd46136c72f2fa",
"sha256:9c0a57390549affc2b5dda24a38de03a5c7cbc58750cd161ff5d106c3c6eec80",
"sha256:a0794e987d55d2f719cc95fcf980fc62d12b80e287e6a761c4be14c60bd9fecc",
"sha256:a3b98121e68bf370dd8ea09df67e916f93ea95b52fc010902312168c4d1aff5d",
"sha256:a60756d55f0887023b3899e6c2923ba5f0042fb11b1d17810b4e07395404f33e",
"sha256:a676bd2fbc2309092b9bbb0083d35718b5420af3a42135ebb1e4c3633f56604d",
"sha256:a732838c78554c1257ff2492f5c8c4c7312d0aecd7f732149e255f3749edd5ee",
"sha256:ae65d65fde4135ef423a2608587c9ef585a3551fc2e4e431e7c7e527047581be",
"sha256:b070a4f064a9edb70f921bfdc270725cff7a78c22036dd37a767c51393fb956f",
"sha256:b6da85949aa91e9f8c521681344bd2e163de894a5492337fba8b05c409225a4f",
"sha256:bbf47110765b2a999803a7de457567389253f8670f7daafb98e059c899ce9764",
"sha256:c06b3f998d2d7160db58db69adfb807d2ec307e883e2f17f6b87a1ef6c723f11",
"sha256:c318fb70542be16d3d4063cde6010b1e4d328993a793529c15a619251f517c39",
"sha256:c4aef42e5fa4c9d5a99f751fb79caa880dac7eaf8a65121549318b984676a1b7",
"sha256:c9ca545e93a9c2a3bdaa2e6e21f7a43267ff0813e8055adf2b591c13164c0c57",
"sha256:da2c3220eb55c4239dd8b982e213da0b79023cac59fe54ca09365f2bc7e4ad32",
"sha256:dd8055da300535eefd446b30995c0813cc4394873c9509323762a93e97c04c03",
"sha256:e2b46e092ea54b732d98c476720386ff2ccd126de1e52076b470b117bff7e409",
"sha256:e334c4f39a2863a239d38b5829e442a87f241a92da9941861ee6ec5d6380b7fe",
"sha256:e5c54f04ca42bbb5153aec5d4f2e3d9f81e316945220ac318abd4083308143f5",
"sha256:f96333f9d2517c752c20a35ff95de5fc2763ac8cdb1653df0f6f45d281620606"
],
"version": "==3.10.1"
},
"pyyaml": {
"hashes": [
"sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
"sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
"sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
"sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
"sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
"sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
"sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
"sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
"sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
"sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
"sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
],
"index": "pypi",
"version": "==5.3.1"
},
"requests": {
"hashes": [
"sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
"sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
],
"index": "pypi",
"version": "==2.23.0"
},
"tabulate": {
"hashes": [
"sha256:ac64cb76d53b1231d364babcd72abbb16855adac7de6665122f97b593f1eb2ba",
"sha256:db2723a20d04bcda8522165c73eea7c300eda74e0ce852d9022e0159d7895007"
],
"index": "pypi",
"version": "==0.8.7"
},
"urllib3": {
"hashes": [
"sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527",
"sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"
],
"version": "==1.25.9"
}
},
"develop": {}
}

+ 53
- 0
README.md View File

@ -0,0 +1,53 @@
# Matrix Archive Tools
Import messages from a matrix.org room, for research, archival, and
preservation.
Developed at [Dinacon 2018](https://www.dinacon.org), for use by the
documentation team.
Use this responsibly and ethically. Don't re-publish people's messages
without their knowledge and consent.
## Setup
Install Pipenv. Run `pipenv install`.
Set these environment variables: `MATRIX_USER`, `MATRIX_PASSWORD`,
`MATRIX_ROOM_IDS`.
`MATRIX_ROOM_IDS` should be a comma-separated list of Matrix room IDs (or a
single ID). Run `pipenv run list` to list the room IDs.
Set `MONGODB_URI` to a MongoDB connection URL, *or* install a local MongoDB
instance.
## Usage
### Import Messages
`pipenv run import` imports the messages into the database.
### Export Messages
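`pipenv run export filename.html` exports a text, HTML, JSON, or YAML file,
depending on the extension of the filename (`.txt`, `.html`, `.json`, or
`.yaml`). JSON and YAML exports contain links to the image download URLs on the
Matrix server. Text and HTML exports reference local files under `thumbnails/`
by default; pass `--no-local-images` to link to the Matrix server instead.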
### Download Images
`pipenv run python download_images.py` downloads all the thumbnail images in
the database into a download directory (default `thumbnails`), skipping images
that have already been downloaded.
Use the `--no-thumbnails` option to download full size images instead of
thumbnails. In this case, the default directory is `images` instead of
`thumbnails`.
## References
[Matrix Client-Server API](https://matrix.org/docs/spec/r0.0.0/client_server.html)
## License
MIT

BIN
View File


BIN
View File


BIN
View File


+ 18
- 0
database_connection.py View File

@ -0,0 +1,18 @@
import os
import re
from mongoengine import connect
# Connection URL for the message database. When it is unset, the default
# mongoengine connection to a database named "matrix" is used instead.
MONGODB_URI = os.getenv('MONGODB_URI')

# Matches URLs of the form mongodb://username:password@host:port/db. The group
# names are the keyword arguments that are passed on to connect().
MONGO_RE = (r'mongodb://'
            r'(?P<username>.+?)'
            r':(?P<password>.+?)'
            r'@(?P<host>(?:.+?):(?:\d+))'
            r'/(?P<db>.+)')

if MONGODB_URI:
    uri_match = re.match(MONGO_RE, MONGODB_URI)
    if uri_match:
        connect_args = uri_match.groupdict()
        # Report only host and database so the password never reaches the
        # console or a log file.
        print(f"Connecting to {connect_args['host']}/{connect_args['db']}")
        connect(**connect_args)
    else:
        # URLs without credentials or an explicit port do not match MONGO_RE;
        # hand them to mongoengine unparsed rather than failing on a None
        # match. The URL itself is not echoed because it may hold credentials.
        print("Connecting to the database named by MONGODB_URI")
        connect(host=MONGODB_URI)
else:
    connect('matrix')

+ 63
- 0
download_images.py View File

@ -0,0 +1,63 @@
from pathlib import Path
from urllib.parse import urlparse
import click
import requests
import database_connection # noqa: F401
from matrix_connection import get_download_url
from schema import Message
def download_stem(message, prefer_thumbnails):
    """Return the local file stem for a message's image.

    The stem is the path part of the chosen URL with leading slashes removed.
    The thumbnail URL is chosen when ``prefer_thumbnails`` is true and the
    message has one; otherwise the full image URL is used.
    """
    chosen_url = None
    if prefer_thumbnails:
        chosen_url = message.thumbnail_url
    if not chosen_url:
        chosen_url = message.image_url
    parsed = urlparse(chosen_url)
    return parsed.path.lstrip('/')
def _require_ok(res):
    """Raise ``requests.HTTPError`` unless ``res`` has status code 200."""
    if res.status_code != 200:
        raise requests.HTTPError(
            f"Unexpected status {res.status_code} for {res.url}", response=res)


def run_downloads(messages, download_dir, prefer_thumbnails):
    """Download the image of each message into ``download_dir``.

    For every message the thumbnail URL is used when ``prefer_thumbnails`` is
    true and the message has one; otherwise the full image URL is used. Media
    whose content type is not ``image/*`` is skipped with a console note.
    Files are named after ``download_stem()`` plus the image subtype.

    Args:
        messages: iterable of messages exposing ``thumbnail_url`` and
            ``image_url``.
        download_dir: ``pathlib.Path`` of an existing target directory.
        prefer_thumbnails: whether thumbnails take precedence over full images.

    Raises:
        requests.HTTPError: if a HEAD or GET request is answered with a status
            other than 200.
    """
    for msg in messages:
        image_url = (msg.thumbnail_url if prefer_thumbnails else None) \
            or msg.image_url
        download_url = get_download_url(image_url)

        # Probe with HEAD first so that non-image media is skipped without
        # transferring its body.
        res = requests.head(download_url)
        _require_ok(res)
        content_type = res.headers.get('content-type', '')
        # Ignore parameters such as "; charset=binary" so they cannot end up
        # in the file suffix, and split into exactly type and subtype.
        mtype, _, subtype = content_type.split(';', 1)[0].strip().partition('/')
        if mtype != 'image' or not subtype:
            print(f"Skipping {image_url}: {content_type}")
            continue

        res = requests.get(download_url)
        _require_ok(res)
        filename = (download_dir / download_stem(msg, prefer_thumbnails)
                    ).with_suffix('.' + subtype)
        print('Downloading', image_url, '->', filename)
        with open(filename, 'wb') as fp:
            fp.write(res.content)
# Command-line entry point. OUTPUT is the optional target directory; without it
# the directory is named after what is downloaded ("thumbnails" or "images").
@click.command()
@click.option('--thumbnails/--no-thumbnails', default=True)
@click.argument('output', required=False)
def download_images(thumbnails, output):
    """Download thumbnails."""
    noun = 'images'
    if thumbnails:
        noun = 'thumbnails'
    if output:
        download_dir = Path(output)
    else:
        download_dir = Path(noun)

    # Only image messages have anything to download.
    image_messages = []
    for candidate in Message.objects:
        if candidate.content.get('msgtype') == 'm.image':
            image_messages.append(candidate)

    download_dir.mkdir(exist_ok=True)

    # A message counts as downloaded when a file with its stem already exists,
    # whatever the extension.
    existing_stems = set()
    for entry in download_dir.glob('*'):
        existing_stems.add(entry.stem)
    pending = []
    for candidate in image_messages:
        if download_stem(candidate, thumbnails) not in existing_stems:
            pending.append(candidate)

    already_present = len(image_messages) - len(pending)
    if already_present:
        print(f"Skipping {already_present} already-downloaded {noun}")
    if not pending:
        print("Nothing to do")
    else:
        print(f"Downloading {len(pending)} new {noun}...")
    run_downloads(pending, download_dir, prefer_thumbnails=thumbnails)


if __name__ == '__main__':
    download_images()

+ 80
- 0
export_messages.py View File

@ -0,0 +1,80 @@
import json
import os
import re
from pathlib import Path
from urllib.parse import urlparse
import click
import yaml
from jinja2 import Template
import database_connection # noqa: F401
from matrix_connection import get_download_url
from schema import Message
# Room IDs taken from the comma-separated MATRIX_ROOM_IDS environment variable.
# The lookup happens at import time and raises KeyError when it is unset.
MATRIX_ROOM_IDS = os.environ['MATRIX_ROOM_IDS'].split(',')
# File extensions (without the dot) accepted for the export filename.
ARCHIVE_FORMATS = ['txt', 'html', 'json', 'yaml']
def encode_message(message):
    """Convert a stored message into a plain dict for export.

    The result is a copy of the message's field data with:

    * the ``id`` entry removed,
    * ``sender`` reduced to the part between ``@`` and the last ``:``,
    * ``timestamp`` rendered as an ISO 8601 string,
    * ``content['url']``, when present, replaced by the value returned by
      ``get_download_url()``.

    The message itself is left untouched.
    """
    data = message._data.copy()
    data.pop('id')
    data['sender'] = re.sub(r'@(.+):.+', r'\1', data['sender'])
    data['timestamp'] = data['timestamp'].isoformat()
    # The copy above is shallow, so ``content`` is still shared with the
    # message. Copy it before rewriting the URL; otherwise the stored document
    # is modified and a second encoding would be handed a non-mxc URL.
    content = dict(data['content'])
    if 'url' in content:
        content['url'] = get_download_url(content['url'])
    data['content'] = content
    return data
def replace_by_local_image(data):
    """Return a copy of ``data`` whose image URL points at a local file.

    For ``m.image`` messages that carry ``info``, ``content['url']`` becomes
    ``thumbnails/<media id>.<subtype>``. The media id is the last path segment
    of the thumbnail URL when there is one, otherwise of the image URL
    (``content['file']['url']`` or ``content['url']``). The subtype comes from
    the matching MIME type. Taking the last segment gives the same name for an
    ``mxc://`` URL and for an already resolved download URL.

    Messages that are not images, or that lack the URL or MIME type needed to
    build the local name, are returned with their content unchanged. The input
    dict and its content are never modified.
    """
    data = data.copy()
    content = data['content']
    if content.get('msgtype') != 'm.image':
        return data
    info = content.get('info')
    if not info:
        return data

    if info.get('thumbnail_url'):
        url = info['thumbnail_url']
        mimetype = (info.get('thumbnail_info') or {}).get('mimetype')
    else:
        url = content['file']['url'] if 'file' in content else content.get('url')
        mimetype = info.get('mimetype')
    if not url or not mimetype:
        return data

    _, _, subtype = mimetype.partition('/')
    stem = urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
    if not stem or not subtype:
        return data

    # Work on a copy so that the caller's content dict keeps its original URL.
    content = dict(content)
    content['url'] = 'thumbnails/' + stem + '.' + subtype
    data['content'] = content
    return data
def dump_html_archive(data, fp, template_path):
    """Render ``data`` through a Jinja2 template and write the result to ``fp``.

    Args:
        data: iterable of encoded messages, available to the template as
            ``messages``.
        fp: writable text file object.
        template_path: path of the template file.

    Message fields are written by room members and therefore untrusted. When
    the template produces HTML, autoescaping is switched on so that markup in
    a message body is shown as text instead of being injected into the page.
    Other templates (plain text) are rendered verbatim.
    """
    template_path = Path(template_path)
    produces_html = template_path.name.endswith(('.html', '.html.tpl'))
    template = Template(template_path.read_text(), autoescape=produces_html)
    fp.write(template.render(messages=data))
# Command-line entry point: export the stored messages of one room to FILENAME.
# The format is chosen from the filename extension (see ARCHIVE_FORMATS).
# --room-id takes either a room ID or a fragment of a room's display name;
# without it, the first entry of MATRIX_ROOM_IDS is exported.
@click.command()
@click.option('--room-id')
@click.option('--local-images/--no-local-images', default=True)
@click.argument('filename', default='archive.html')
def export_archive(room_id, local_images, filename):
    # Anything that does not look like "!localpart:server" is treated as part
    # of a display name and resolved through the Matrix server. The pattern is
    # the one schema.Message uses for room IDs, so IDs on homeservers other
    # than matrix.org are recognised as well.
    if room_id and not re.match(r'!.+:.+', room_id):
        from matrix_connection import matrix_client
        rooms = matrix_client().get_rooms()
        name_fragment = room_id
        room_id = next((rid for rid, room in rooms.items()
                        if name_fragment in room.display_name), None)
        if room_id is None:
            # Report an unknown name as a usage error rather than letting a
            # bare StopIteration escape.
            raise click.BadParameter(
                f"no room has a display name containing {name_fragment!r}",
                param_hint='--room-id')
    if not room_id:
        room_id, *_ = MATRIX_ROOM_IDS

    fmt = Path(filename).suffix.lstrip('.')
    if fmt not in ARCHIVE_FORMATS:
        raise click.BadParameter(f"{fmt} is not in {ARCHIVE_FORMATS}")

    messages = Message.objects(room_id=room_id).order_by('timestamp')
    data = map(encode_message, messages)
    print(f"Writing {len(messages)} messages to {filename!r}")
    # The HTML template declares UTF-8, so write that encoding explicitly
    # instead of relying on the platform default.
    with open(filename, 'w', encoding='utf-8') as fp:
        if fmt in ('text', 'txt', 'html'):
            if local_images:
                data = map(replace_by_local_image, data)
            template_path = f'templates/default.{fmt}.tpl'
            dump_html_archive(data, fp, template_path=template_path)
        elif fmt == 'json':
            json.dump(list(data), fp, indent=2)
        elif fmt == 'yaml':
            yaml.dump(list(data), fp, default_flow_style=None)


if __name__ == '__main__':
    export_archive()

+ 86
- 0
import_messages.py View File

@ -0,0 +1,86 @@
import os
from datetime import datetime
from itertools import islice
import click
import database_connection # noqa: F401
from matrix_connection import matrix_client
from mongoengine.errors import FieldDoesNotExist, ValidationError
from schema import Message
# Rooms to import, taken from the comma-separated MATRIX_ROOM_IDS environment
# variable. The lookup happens at import time and raises KeyError when unset.
MATRIX_ROOM_IDS = os.environ['MATRIX_ROOM_IDS'].split(',')
# Event types that import_events() keeps; events of any other type are ignored.
MESSAGE_EVENT_TYPES = {'m.room.message', 'm.room.message.feedback'}
def get_room_events(room_id):
    """Yield the events of the room with the given ID.

    The events the room object already holds are yielded first. Further pages
    are then requested with ``get_room_messages``, starting from the room's
    ``prev_batch`` token and continuing from each response's ``end`` token,
    until a response comes back with an empty ``chunk``.
    """
    room = matrix_client().get_rooms()[room_id]
    print(f"Reading events from room {room.display_name!r}…")
    yield from room.events

    page_size = 1000  # empirically, this is the largest honored value
    token = room.prev_batch
    while True:
        page = room.client.api.get_room_messages(room.room_id, token, 'b',
                                                 limit=page_size)
        chunk = page['chunk']
        if not chunk:
            return
        print(f"Read {len(chunk)} events...")
        yield from chunk
        token = page['end']
def import_events(room_id, limit=None):
    """Save the not-yet-stored messages of a room, yielding each saved one.

    Events are read lazily from ``get_room_events()``. An event is kept when
    its type is in ``MESSAGE_EVENT_TYPES``, it has no ``redacted_because``
    entry, and no ``Message`` with the same event and room ID exists yet. When
    ``limit`` is truthy, at most that many events are saved.

    If a message cannot be built or validated, its fields are printed and the
    error is re-raised.
    """
    def is_new_message(event):
        # The checks run cheapest first; the database is only queried for
        # events that pass the first two.
        if event['type'] not in MESSAGE_EVENT_TYPES:
            return False
        if 'redacted_because' in event:
            return False
        return not Message.objects(event_id=event['event_id'],
                                   room_id=event['room_id'])

    candidates = filter(is_new_message, get_room_events(room_id))
    if limit:
        candidates = islice(candidates, limit)

    for event in candidates:
        fields = event.copy()
        # Rename and convert the event keys to match the Message schema.
        fields['messageType'] = fields.pop('type')
        fields['room_id'] = room_id
        millis = fields.pop('origin_server_ts')
        fields['timestamp'] = datetime.fromtimestamp(millis / 1000)
        for unwanted in ('age', 'unsigned'):
            fields.pop(unwanted, None)
        try:
            message = Message(**replace_dots(fields))
            message.save()
        except (FieldDoesNotExist, ValidationError):
            print(fields)
            raise
        yield message
def replace_dots(obj):
    """Recursively remove '.' from dictionary key names, to avoid mongodb
    error.

    Dictionaries are rebuilt with the dots stripped from their keys. Lists are
    rebuilt element by element, so dictionaries nested inside a list are
    cleaned as well. Every other value, including strings that contain dots,
    is returned unchanged. The input is not modified.
    """
    if isinstance(obj, dict):
        return {k.replace('.', ''): replace_dots(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [replace_dots(item) for item in obj]
    return obj
# Command-line entry point: import every room listed in MATRIX_ROOM_IDS and
# report how many messages were added per room. --limit caps the number of
# messages imported from each room.
@click.command()
@click.option('--limit', type=int)
def cli(limit):
    """Import events."""
    for room_id in MATRIX_ROOM_IDS:
        # import_events() is a generator; draining it is what performs the
        # import.
        imported = 0
        for _ in import_events(room_id, limit):
            imported += 1
        print(f"Imported {imported} messages")
    total = Message.objects.count()
    print(f"The database now has {total} messages")


if __name__ == '__main__':
    cli()

+ 23
- 0
list_rooms.py View File

@ -0,0 +1,23 @@
import re
import click
from matrix_connection import matrix_client
from tabulate import tabulate
# Command-line entry point: print a table of room IDs and display names.
# PATTERN is an optional regular expression (surrounding slashes are ignored);
# when given, only rooms whose display name matches it are listed.
@click.command()
@click.argument('pattern', required=False, type=str)
def list_rooms(pattern):
    """List room ids and keys."""
    rooms = matrix_client().get_rooms()
    rows = []
    for rid, room in rooms.items():
        rows.append((rid, room.display_name))
    if pattern:
        regex = pattern.strip('/')
        rows = [row for row in rows if re.search(regex, row[1])]
    print(tabulate(rows, headers=['Room ID', 'Display Name']))


if __name__ == '__main__':
    list_rooms()

+ 36
- 0
matrix_connection.py View File

@ -0,0 +1,36 @@
import os
from urllib.parse import urlparse
from matrix_client.client import MatrixClient
# Login settings are read from the environment at import time. MATRIX_USER and
# MATRIX_PASSWORD are required (KeyError when unset); MATRIX_HOST falls back to
# https://matrix.org.
MATRIX_USER = os.environ['MATRIX_USER']
MATRIX_PASSWORD = os.environ['MATRIX_PASSWORD']
MATRIX_HOST = os.environ.get('MATRIX_HOST', "https://matrix.org")
# Logged-in client, created by the first matrix_client() call and then reused.
_client = None
# Cache of download-URL resolver callables keyed by media host; filled by
# get_download_url().
_download_url_resolvers = dict()
def matrix_client():
    """Return the shared Matrix client, signing in on the first call.

    The first call creates a client for ``MATRIX_HOST`` and logs in with
    ``MATRIX_USER`` and ``MATRIX_PASSWORD``. Later calls return that same
    client.
    """
    global _client
    if not _client:
        print(f"Signing into {MATRIX_HOST}...")
        session = MatrixClient(MATRIX_HOST)
        session.login_with_password(username=MATRIX_USER,
                                    password=MATRIX_PASSWORD)
        # Publish the client only after the login succeeded.
        _client = session
    return _client
def get_download_url(url):
    """Return the https download URL for an ``mxc://`` media URL.

    The host part of ``url`` selects a resolver, namely the
    ``get_download_url`` method of a ``MatrixClient`` API object created for
    that host. Resolvers are cached per host in ``_download_url_resolvers``.
    The resolver's result is prefixed with ``https://``.

    Raises:
        ValueError: if ``url`` does not use the ``mxc`` scheme. An explicit
            exception is used because, unlike an ``assert``, the check still
            runs under ``python -O``.
    """
    u = urlparse(url)
    if u.scheme != 'mxc':
        raise ValueError(f"expected an mxc:// URL, got {url!r}")
    host = u.netloc
    resolver = _download_url_resolvers.get(host)
    if resolver is None:
        resolver = MatrixClient(host).api.get_download_url
        _download_url_resolvers[host] = resolver
    return 'https://' + resolver(url)
# Download-URL resolver bound to MATRIX_HOST; a MatrixClient is constructed for
# it at import time.
# NOTE(review): not referenced by any other module in this commit; confirm
# whether it is still needed.
get_matrix_download_url = MatrixClient(MATRIX_HOST).api.get_download_url

+ 25
- 0
schema.py View File

@ -0,0 +1,25 @@
from mongoengine import DateTimeField, Document, DynamicField, StringField, BooleanField
class Message(Document):
    """A Matrix room message event as stored in the database."""

    # The first positional argument of each StringField is the regular
    # expression its value is validated against.
    room_id = StringField(r'!.+:.+', required=True)
    # An event ID only has to be unique within its room.
    event_id = StringField(r'\$.+', required=True, unique_with='room_id')
    sender = StringField(r'@.+:.+', required=True)
    user_id = StringField(r'@.+:.+', required=False)
    # Called ``messageType`` in Python but stored under the key "type".
    messageType = StringField(r'm\.room\.message', db_field='type', required=True)
    timestamp = DateTimeField(required=True)
    # The event's content dictionary; its keys depend on the message type.
    content = DynamicField(required=True)
    verified = BooleanField(required=False)
    decrypted = BooleanField(required=False)

    def is_image(self):
        """Return True when the content's ``msgtype`` is ``m.image``."""
        return self.content.get('msgtype') == 'm.image'

    @property
    def image_url(self):
        """URL of the full-size image, or None.

        None is returned for non-image messages and for image messages that
        carry no URL. When the content has no ``url`` entry, the URL is taken
        from ``content['file']['url']`` — the same fallback the exporter
        applies — instead of raising KeyError.
        """
        if not self.is_image():
            return None
        url = self.content.get('url')
        if url:
            return url
        return (self.content.get('file') or {}).get('url')

    @property
    def thumbnail_url(self):
        """URL of the thumbnail, or None.

        None is returned for non-image messages and for image messages without
        ``info`` or without a ``thumbnail_url`` inside it, so callers can fall
        back to ``image_url``.
        """
        if not self.is_image():
            return None
        return (self.content.get('info') or {}).get('thumbnail_url')

+ 3
- 0
setup.cfg View File

@ -0,0 +1,3 @@
[flake8]
ignore = D100,D101,D102,D103,D104
max-line-length = 88

+ 19
- 0
templates/default.html.tpl View File

@ -0,0 +1,19 @@
<meta charset="UTF-8">
{% for message in messages %}
{% set content = message.content %}
<div class="message">
<dl>
<dt>From</dt>
<dd>{{ message.sender }}</dd>
<dt>Date</dt>
<dd>{{ message.timestamp }}</dd>
</dl>
{% if content.msgtype == 'm.text' %}
<div class="body">{{ content.body }}</div>
{% elif content.msgtype == 'm.image' %}
<div class="body"><img src="{{ content.url }}" /></div>
{% else %}
<div class="error">Unknown message type</div>
{% endif %}
</div>
{% endfor %}

+ 13
- 0
templates/default.txt.tpl View File

@ -0,0 +1,13 @@
{% for message in messages -%}
{%- set content = message.content -%}
From {{ message.sender }}
Date {{ message.timestamp }}
{% if content.msgtype == 'm.text' %}
{{ content.body }}
{%- elif content.msgtype == 'm.image' -%}
Image: {{ content.url }}
{%- else -%}
Unknown type: {{ content.msgtype }}
{%- endif %}
---
{% endfor %}

Loading…
Cancel
Save