commit c68c0e138ff4b96afd0c7cfd572875a28a31c7cb Author: spike Date: Fri Aug 28 01:57:35 2020 +0100 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..623ecd5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +thumbnails/ +images/ +messages.html diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..6af8cc9 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,20 @@ +MIT License + +Copyright (c) 2018 Oliver Steele + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..4ed8ce7 --- /dev/null +++ b/Pipfile @@ -0,0 +1,23 @@ +[[source]] +name = "pypi" +url = "https://pypi.python.org/simple" +verify_ssl = true + +[requires] +python_full_version = "3.6.5" + +[scripts] +import = "python import_messages.py" +export = "python export_messages.py" +list = "python list_rooms.py" + +[packages] +matrix_client = "*" +mongoengine = "*" +click = "*" +tabulate = "*" +pyyaml = "*" +"jinja2" = "*" +requests = "*" + +[dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..d4eaf16 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,210 @@ +{ + "_meta": { + "hash": { + "sha256": "cc3f8517f705a63c6a297c6c461a500fb75e28dc81050219498b261f1ca157b1" + }, + "pipfile-spec": 6, + "requires": { + "python_full_version": "3.6.5" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.python.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "certifi": { + "hashes": [ + "sha256:5ad7e9a056d25ffa5082862e36f119f7f7cec6457fa07ee2f8c339814b80c9b1", + "sha256:9cd41137dc19af6a5e03b630eefe7d1f458d964d406342dd3edf625839b944cc" + ], + "version": "==2020.4.5.2" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "click": { + "hashes": [ + "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a", + "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc" + ], + "index": "pypi", + "version": "==7.1.2" + }, + "idna": { + "hashes": [ + "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", + "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" + ], + "version": "==2.9" + }, + "jinja2": { + "hashes": [ + "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0", + 
"sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035" + ], + "index": "pypi", + "version": "==2.11.2" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + 
"sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" + ], + "version": "==1.1.1" + }, + "matrix-client": { + "hashes": [ + "sha256:2855a2614a177db66f9bc3ba38cbd2876041456f663c334f72a160ab6bb11c49", + "sha256:dce3ccb8665df0d519f08e07a16e6d3f9fab3a947df4b7a7c4bb26573d68f2d5" + ], + "index": "pypi", + "version": "==0.3.2" + }, + "mongoengine": { + "hashes": [ + "sha256:6e127f45f71c2bc5e72461ec297a0c20f04c3ee0bf6dd869e336226e325db6ef", + "sha256:db9e5d587e5d74e52851e0e4a53fd744725bfa9918ae6070139f5ba9c62c6edf" + ], + "index": "pypi", + "version": "==0.20.0" + }, + "pymongo": { + "hashes": [ + "sha256:01b4e10027aef5bb9ecefbc26f5df3368ce34aef81df43850f701e716e3fe16d", + "sha256:0fc5aa1b1acf7f61af46fe0414e6a4d0c234b339db4c03a63da48599acf1cbfc", + "sha256:1396eb7151e0558b1f817e4b9d7697d5599e5c40d839a9f7270bd90af994ad82", + "sha256:18e84a3ec5e73adcb4187b8e5541b2ad61d716026ed9863267e650300d8bea33", + "sha256:19adf2848b80cb349b9891cc854581bbf24c338be9a3260e73159bdeb2264464", + "sha256:20ee0475aa2ba437b0a14806f125d696f90a8433d820fb558fdd6f052acde103", + "sha256:26798795097bdeb571f13942beef7e0b60125397811c75b7aa9214d89880dd1d", + "sha256:26e707a4eb851ec27bb969b5f1413b9b2eac28fe34271fa72329100317ea7c73", + 
"sha256:2a3c7ad01553b27ec553688a1e6445e7f40355fb37d925c11fcb50b504e367f8", + "sha256:2f07b27dbf303ea53f4147a7922ce91a26b34a0011131471d8aaf73151fdee9a", + "sha256:316f0cf543013d0c085e15a2c8abe0db70f93c9722c0f99b6f3318ff69477d70", + "sha256:31d11a600eea0c60de22c8bdcb58cda63c762891facdcb74248c36713240987f", + "sha256:334ef3ffd0df87ea83a0054454336159f8ad9c1b389e19c0032d9cb8410660e6", + "sha256:358ba4693c01022d507b96a980ded855a32dbdccc3c9331d0667be5e967f30ed", + "sha256:3a6568bc53103df260f5c7d2da36dffc5202b9a36c85540bba1836a774943794", + "sha256:444bf2f44264578c4085bb04493bfed0e5c1b4fe7c2704504d769f955cc78fe4", + "sha256:47a00b22c52ee59dffc2aad02d0bbfb20c26ec5b8de8900492bf13ad6901cf35", + "sha256:4c067db43b331fc709080d441cb2e157114fec60749667d12186cc3fc8e7a951", + "sha256:4c092310f804a5d45a1bcaa4191d6d016c457b6ed3982a622c35f729ff1c7f6b", + "sha256:53b711b33134e292ef8499835a3df10909c58df53a2a0308f598c432e9a62892", + "sha256:568d6bee70652d8a5af1cd3eec48b4ca1696fb1773b80719ebbd2925b72cb8f6", + "sha256:56fa55032782b7f8e0bf6956420d11e2d4e9860598dfe9c504edec53af0fc372", + "sha256:5a2c492680c61b440272341294172fa3b3751797b1ab983533a770e4fb0a67ac", + "sha256:61235cc39b5b2f593086d1d38f3fc130b2d125bd8fc8621d35bc5b6bdeb92bd2", + "sha256:619ac9aaf681434b4d4718d1b31aa2f0fce64f2b3f8435688fcbdc0c818b6c54", + "sha256:6238ac1f483494011abde5286282afdfacd8926659e222ba9b74c67008d3a58c", + "sha256:63752a72ca4d4e1386278bd43d14232f51718b409e7ac86bcf8810826b531113", + "sha256:6fdc5ccb43864065d40dd838437952e9e3da9821b7eac605ba46ada77f846bdf", + "sha256:7abc3a6825a346fa4621a6f63e3b662bbb9e0f6ffc32d30a459d695f20fb1a8b", + "sha256:7aef381bb9ae8a3821abd7f9d4d93978dbd99072b48522e181baeffcd95b56ae", + "sha256:80df3caf251fe61a3f0c9614adc6e2bfcffd1cd3345280896766712fb4b4d6d7", + "sha256:95f970f34b59987dee6f360d2e7d30e181d58957b85dff929eee4423739bd151", + "sha256:993257f6ca3cde55332af1f62af3e04ca89ce63c08b56a387cdd46136c72f2fa", + "sha256:9c0a57390549affc2b5dda24a38de03a5c7cbc58750cd161ff5d106c3c6eec80", 
+ "sha256:a0794e987d55d2f719cc95fcf980fc62d12b80e287e6a761c4be14c60bd9fecc", + "sha256:a3b98121e68bf370dd8ea09df67e916f93ea95b52fc010902312168c4d1aff5d", + "sha256:a60756d55f0887023b3899e6c2923ba5f0042fb11b1d17810b4e07395404f33e", + "sha256:a676bd2fbc2309092b9bbb0083d35718b5420af3a42135ebb1e4c3633f56604d", + "sha256:a732838c78554c1257ff2492f5c8c4c7312d0aecd7f732149e255f3749edd5ee", + "sha256:ae65d65fde4135ef423a2608587c9ef585a3551fc2e4e431e7c7e527047581be", + "sha256:b070a4f064a9edb70f921bfdc270725cff7a78c22036dd37a767c51393fb956f", + "sha256:b6da85949aa91e9f8c521681344bd2e163de894a5492337fba8b05c409225a4f", + "sha256:bbf47110765b2a999803a7de457567389253f8670f7daafb98e059c899ce9764", + "sha256:c06b3f998d2d7160db58db69adfb807d2ec307e883e2f17f6b87a1ef6c723f11", + "sha256:c318fb70542be16d3d4063cde6010b1e4d328993a793529c15a619251f517c39", + "sha256:c4aef42e5fa4c9d5a99f751fb79caa880dac7eaf8a65121549318b984676a1b7", + "sha256:c9ca545e93a9c2a3bdaa2e6e21f7a43267ff0813e8055adf2b591c13164c0c57", + "sha256:da2c3220eb55c4239dd8b982e213da0b79023cac59fe54ca09365f2bc7e4ad32", + "sha256:dd8055da300535eefd446b30995c0813cc4394873c9509323762a93e97c04c03", + "sha256:e2b46e092ea54b732d98c476720386ff2ccd126de1e52076b470b117bff7e409", + "sha256:e334c4f39a2863a239d38b5829e442a87f241a92da9941861ee6ec5d6380b7fe", + "sha256:e5c54f04ca42bbb5153aec5d4f2e3d9f81e316945220ac318abd4083308143f5", + "sha256:f96333f9d2517c752c20a35ff95de5fc2763ac8cdb1653df0f6f45d281620606" + ], + "version": "==3.10.1" + }, + "pyyaml": { + "hashes": [ + "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97", + "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76", + "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2", + "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648", + "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf", + "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f", + 
"sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2", + "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee", + "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d", + "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c", + "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a" + ], + "index": "pypi", + "version": "==5.3.1" + }, + "requests": { + "hashes": [ + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" + ], + "index": "pypi", + "version": "==2.23.0" + }, + "tabulate": { + "hashes": [ + "sha256:ac64cb76d53b1231d364babcd72abbb16855adac7de6665122f97b593f1eb2ba", + "sha256:db2723a20d04bcda8522165c73eea7c300eda74e0ce852d9022e0159d7895007" + ], + "index": "pypi", + "version": "==0.8.7" + }, + "urllib3": { + "hashes": [ + "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527", + "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115" + ], + "version": "==1.25.9" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..ee5eb02 --- /dev/null +++ b/README.md @@ -0,0 +1,53 @@ +# Matrix Archive Tools + +Import messages from a matrix.org room, for research, archival, and +preservation. + +Developed at [Dinacon 2018](https://www.dinacon.org), for use by the +documentation team. + +Use this responsibly and ethically. Don't re-publish people's messages +without their knowledge and consent. + +## Setup + +Install Pipenv. Run `pipenv install`. + +Set these environment variables: `MATRIX_USER`, `MATRIX_PASSWORD`, +`MATRIX_ROOM_IDS`. + +`MATRIX_ROOM_IDS` should be a comma-separated list of Matrix room IDs (or a +single id). Run `pipenv run list` to list the room ids. + +Set `MONGODB_URI` to a MongoDB connection URL, *or* install a local MongoDB +instance. 
+ +## Usage + +### Import Messages + +`pipenv run import` imports the messages into the database. + +### Export Messages + +`pipenv run export filename.html` exports a text, HTML, JSON, or YAML file, +depending on the name of `filename.html`. The file contains links to the image +download URLs on the Matrix server. + +### Download Images + +`pipenv run python download_images.py` downloads all the thumbnail images in the +database into a download directory (default `thumbnails`), skipping images that +have already been downloaded. + +Use the `--no-thumbnails` option to download full size images instead of +thumbnails. In this case, the default directory is `images` instead of +`thumbnails`. + +## References + +[Matrix Client-Server API](https://matrix.org/docs/spec/r0.0.0/client_server.html) + +## License + +MIT diff --git a/__pycache__/database_connection.cpython-38.pyc b/__pycache__/database_connection.cpython-38.pyc new file mode 100644 index 0000000..8253d0b Binary files /dev/null and b/__pycache__/database_connection.cpython-38.pyc differ diff --git a/__pycache__/matrix_connection.cpython-38.pyc b/__pycache__/matrix_connection.cpython-38.pyc new file mode 100644 index 0000000..112ba82 Binary files /dev/null and b/__pycache__/matrix_connection.cpython-38.pyc differ diff --git a/__pycache__/schema.cpython-38.pyc b/__pycache__/schema.cpython-38.pyc new file mode 100644 index 0000000..946068f Binary files /dev/null and b/__pycache__/schema.cpython-38.pyc differ diff --git a/database_connection.py b/database_connection.py new file mode 100644 index 0000000..c532867 --- /dev/null +++ b/database_connection.py @@ -0,0 +1,18 @@ +import os +import re + +from mongoengine import connect + +MONGODB_URI = os.getenv('MONGODB_URI') +MONGO_RE = (r'mongodb://' + r'(?P<username>.+?)' + r':(?P<password>.+?)' + r'@(?P<host>(?:.+?):(?:\d+))' + r'/(?P<db>.+)') + +if MONGODB_URI: + print(f"Connecting to {MONGODB_URI}") + connect_args = re.match(MONGO_RE, MONGODB_URI).groupdict() + connect(**connect_args) +else: + 
connect('matrix') diff --git a/download_images.py b/download_images.py new file mode 100644 index 0000000..7ae35db --- /dev/null +++ b/download_images.py @@ -0,0 +1,63 @@ + +from pathlib import Path +from urllib.parse import urlparse + +import click +import requests + +import database_connection # noqa: F401 +from matrix_connection import get_download_url +from schema import Message + + +def download_stem(message, prefer_thumbnails): + image_url = (message.thumbnail_url if prefer_thumbnails else None) \ + or message.image_url + return urlparse(image_url).path.lstrip('/') + + +def run_downloads(messages, download_dir, prefer_thumbnails): + for msg in messages: + image_url = (msg.thumbnail_url if prefer_thumbnails else None) or msg.image_url + res = requests.head(get_download_url(image_url)) + assert res.status_code == 200 + mtype, subtype = res.headers['content-type'].split('/', 2) + if mtype != 'image': + print(f"Skipping {image_url}: {res.headers['content-type']}") + continue + + res = requests.get(get_download_url(image_url)) + assert res.status_code == 200 + filename = (download_dir / download_stem(msg, prefer_thumbnails) + ).with_suffix('.' 
+ subtype) + print('Downloading', image_url, '->', filename) + with open(filename, 'wb') as fp: + fp.write(res.content) + + +@click.command() +@click.option('--thumbnails/--no-thumbnails', default=True) +@click.argument('output', required=False) +def download_images(thumbnails, output): + """Download thumbnails.""" + noun = 'thumbnails' if thumbnails else 'images' + download_dir = Path(output or noun) + messages = [msg for msg in Message.objects + if msg.content.get('msgtype') == 'm.image'] + download_dir.mkdir(exist_ok=True) + current_stems = {p.stem for p in download_dir.glob('*')} + new_messages = [msg for msg in messages + if download_stem(msg, thumbnails) + not in current_stems] + skip_count = len(messages) - len(new_messages) + if skip_count: + print(f"Skipping {skip_count} already-downloaded {noun}") + if new_messages: + print(f"Downloading {len(new_messages)} new {noun}...") + else: + print("Nothing to do") + run_downloads(new_messages, download_dir, prefer_thumbnails=thumbnails) + + +if __name__ == '__main__': + download_images() diff --git a/export_messages.py b/export_messages.py new file mode 100644 index 0000000..b35d249 --- /dev/null +++ b/export_messages.py @@ -0,0 +1,80 @@ +import json +import os +import re +from pathlib import Path +from urllib.parse import urlparse + +import click +import yaml +from jinja2 import Template + +import database_connection # noqa: F401 +from matrix_connection import get_download_url +from schema import Message + +MATRIX_ROOM_IDS = os.environ['MATRIX_ROOM_IDS'].split(',') + +ARCHIVE_FORMATS = ['txt', 'html', 'json', 'yaml'] + + +def encode_message(message): + data = message._data.copy() + data.pop('id') + data['sender'] = re.sub(r'@(.+):.+', r'\1', data['sender']) + data['timestamp'] = data['timestamp'].isoformat() + content = data['content'] + if 'url' in content: + content['url'] = get_download_url(content['url']) + return data + + +def replace_by_local_image(data): + data = data.copy() + content = data['content'] + 
if content.get('msgtype') == 'm.image' and 'info' in content: + url = content['file']['url'] if 'file' in content else content['url'] + mimetype = content['info']['mimetype'] + if 'thumbnail_url' in content['info'] and content['info']['thumbnail_url'] != '': + url, mimetype = content['info']['thumbnail_url'], content['info']['thumbnail_info']['mimetype'] + _, subtype = mimetype.split('/', 2) + url = urlparse(url) + content['url'] = 'thumbnails/' + url.path.strip('/') + '.' + subtype + return data + + +def dump_html_archive(data, fp, template_path): + template = Template(Path(template_path).read_text()) + fp.write(template.render(messages=data)) + + +@click.command() +@click.option('--room-id') +@click.option('--local-images/--no-local-images', default=True) +@click.argument('filename', default='archive.html') +def export_archive(room_id, local_images, filename): + if room_id and not re.match(r'!.+:matrix.org', room_id): + from matrix_connection import matrix_client + rooms = matrix_client().get_rooms() + room_id = next(id for id, room in rooms.items() if room_id in room.display_name) + if not room_id: + room_id, *_ = MATRIX_ROOM_IDS + fmt = Path(filename).suffix.lstrip('.') + if fmt not in ARCHIVE_FORMATS: + raise click.BadParameter(f"{fmt} is not in {ARCHIVE_FORMATS}") + messages = Message.objects(room_id=room_id).order_by('timestamp') + data = map(encode_message, messages) + print(f"Writing {len(messages)} messages to {filename!r}") + with open(filename, 'w') as fp: + if fmt in ('text', 'txt', 'html'): + if local_images: + data = map(replace_by_local_image, data) + template_path = f'templates/default.{fmt}.tpl' + dump_html_archive(data, fp, template_path=template_path) + elif fmt == 'json': + json.dump(list(data), fp, indent=2) + elif fmt == 'yaml': + yaml.dump(list(data), fp, default_flow_style=None) + + +if __name__ == '__main__': + export_archive() diff --git a/import_messages.py b/import_messages.py new file mode 100644 index 0000000..f245e94 --- /dev/null 
+++ b/import_messages.py @@ -0,0 +1,86 @@ +import os +from datetime import datetime +from itertools import islice + +import click + +import database_connection # noqa: F401 +from matrix_connection import matrix_client +from mongoengine.errors import FieldDoesNotExist, ValidationError +from schema import Message + +MATRIX_ROOM_IDS = os.environ['MATRIX_ROOM_IDS'].split(',') + +MESSAGE_EVENT_TYPES = {'m.room.message', 'm.room.message.feedback'} + + +def get_room_events(room_id): + """Iterate room events, starting at the cursor.""" + room = matrix_client().get_rooms()[room_id] + print(f"Reading events from room {room.display_name!r}…") + yield from room.events + batch_size = 1000 # empirically, this is the largest honored value + prev_batch = room.prev_batch + while True: + res = room.client.api.get_room_messages(room.room_id, prev_batch, 'b', + limit=batch_size) + events = res['chunk'] + if not events: + break + print(f"Read {len(events)} events...") + yield from events + prev_batch = res['end'] + + +def import_events(room_id, limit=None): + events = get_room_events(room_id) + # restrict to messages + messages = (event for event in events if event['type'] in MESSAGE_EVENT_TYPES) + # exclude redacted messages + messages = (event for event in messages if 'redacted_because' not in event) + # exclude messages that have already been saved + messages = (event for event in messages + if not Message.objects(event_id=event['event_id'], + room_id=event['room_id'])) + if limit: + messages = islice(messages, limit) + for event in messages: + fields = event.copy() + fields['messageType'] = fields.pop('type') + fields['room_id'] = room_id + fields['timestamp'] = datetime.fromtimestamp( + fields.pop('origin_server_ts') / 1000) + fields.pop('age', None) + fields.pop('unsigned', None) + try: + message = Message(**replace_dots(fields)) + message.save() + except (FieldDoesNotExist, ValidationError): + print(fields) + raise + + yield message + + +def replace_dots(obj): + """Recursively 
replace '.' by '•' in dictionary key names, to avoid mongodb + error. + """ + return {k.replace('.', '•'): replace_dots(v) for k, v in obj.items()} \ + if isinstance(obj, dict) \ + else obj + + + +@click.command() +@click.option('--limit', type=int) +def cli(limit): + """Import events.""" + for room_id in MATRIX_ROOM_IDS: + import_count = sum(1 for _ in import_events(room_id, limit)) + print(f"Imported {import_count} messages") + print(f"The database now has {Message.objects.count()} messages") + + +if __name__ == '__main__': + cli() diff --git a/list_rooms.py b/list_rooms.py new file mode 100644 index 0000000..2fdf4e6 --- /dev/null +++ b/list_rooms.py @@ -0,0 +1,23 @@ +import re + +import click + +from matrix_connection import matrix_client +from tabulate import tabulate + + +@click.command() +@click.argument('pattern', required=False, type=str) +def list_rooms(pattern): + """List room ids and keys.""" + rooms = matrix_client().get_rooms() + data = [(rid, room.display_name) + for rid, room in rooms.items()] + if pattern: + data = [(rid, name) for rid, name in data + if re.search(pattern.strip('/'), name)] + print(tabulate(data, headers=['Room ID', 'Display Name'])) + + +if __name__ == '__main__': + list_rooms() diff --git a/matrix_connection.py b/matrix_connection.py new file mode 100644 index 0000000..e982fec --- /dev/null +++ b/matrix_connection.py @@ -0,0 +1,36 @@ +import os +from urllib.parse import urlparse + +from matrix_client.client import MatrixClient + +MATRIX_USER = os.environ['MATRIX_USER'] +MATRIX_PASSWORD = os.environ['MATRIX_PASSWORD'] +MATRIX_HOST = os.environ.get('MATRIX_HOST', "https://matrix.org") + +_client = None +_download_url_resolvers = dict() + + +def matrix_client(): + global _client + if _client: + return _client + print(f"Signing into {MATRIX_HOST}...") + client = MatrixClient(MATRIX_HOST) + client.login_with_password(username=MATRIX_USER, + password=MATRIX_PASSWORD) + _client = client + return client + + +def get_download_url(url): + u 
= urlparse(url) + assert u.scheme == 'mxc' + host = u.netloc + resolvers = _download_url_resolvers + resolver = resolvers.get(host) or MatrixClient(host).api.get_download_url + resolvers[host] = resolver + return 'https://' + resolver(url) + + +get_matrix_download_url = MatrixClient(MATRIX_HOST).api.get_download_url diff --git a/schema.py b/schema.py new file mode 100644 index 0000000..4afab35 --- /dev/null +++ b/schema.py @@ -0,0 +1,25 @@ +from mongoengine import DateTimeField, Document, DynamicField, StringField, BooleanField + + +class Message(Document): + room_id = StringField(r'!.+:.+', required=True) + event_id = StringField(r'\$.+', required=True, unique_with='room_id') + sender = StringField(r'@.+:.+', required=True) + user_id = StringField(r'@.+:.+', required=False) + messageType = StringField(r'm\.room\.message', db_field='type', required=True) + timestamp = DateTimeField(required=True) + content = DynamicField(required=True) + verified = BooleanField(required=False) + decrypted = BooleanField(required=False) + + def is_image(self): + return self.content.get('msgtype') == 'm.image' + + @property + def image_url(self): + return self.content['url'] if self.is_image() else None + + @property + def thumbnail_url(self): + return (self.content['info'].get('thumbnail_url') + if self.is_image() else None) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..cff6795 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +ignore = D100,D101,D102,D103,D104 +max-line-length = 88 diff --git a/templates/default.html.tpl b/templates/default.html.tpl new file mode 100644 index 0000000..de908ab --- /dev/null +++ b/templates/default.html.tpl @@ -0,0 +1,19 @@ + +{% for message in messages %} +{% set content = message.content %} +
+
+
From
+
{{ message.sender }}
+
Date
+
{{ message.timestamp }}
+
+{% if content.msgtype == 'm.text' %} +
{{ content.body }}
+{% elif content.msgtype == 'm.image' %} +
+{% else %} +
Unknown message type
+{% endif %} +
+{% endfor %} diff --git a/templates/default.txt.tpl b/templates/default.txt.tpl new file mode 100644 index 0000000..ca0a107 --- /dev/null +++ b/templates/default.txt.tpl @@ -0,0 +1,13 @@ +{% for message in messages -%} +{%- set content = message.content -%} +From {{ message.sender }} +Date {{ message.timestamp }} +{% if content.msgtype == 'm.text' %} +{{ content.body }} +{%- elif content.msgtype == 'm.image' -%} +Image: {{ content.url }} +{%- else -%} +Unknown type: {{ content.msgtype }} +{%- endif %} +--- +{% endfor %}