lnav/src/text_anonymizer.cc

/**
 * Copyright (c) 2022, Timothy Stack
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * * Neither the name of Timothy Stack nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "text_anonymizer.hh"

#include <arpa/inet.h>
#include <curl/curl.h>
#include <netinet/in.h>

#include "animals-json.h"
#include "config.h"
#include "data_scanner.hh"
#include "diseases-json.h"
#include "ghc/filesystem.hpp"
#include "hasher.hh"
#include "pcrepp/pcre2pp.hh"
#include "words-json.h"
#include "yajlpp/yajlpp_def.hh"

namespace lnav {

/**
 * A list of strings that can be indexed past its end to produce replacement
 * names.
 */
struct random_list {
    /** The candidate replacement strings. */
    std::vector<std::string> rl_data;

    /**
     * Map an index to a replacement string.
     *
     * Indexes smaller than the list size return the matching entry as-is.
     * Larger indexes wrap around the list and have the number of wraps
     * appended (e.g. the entry "cat" becomes "cat1" on the first wrap).
     *
     * @param index The zero-based index to map.
     * @return The replacement string.  If the list is empty, the decimal
     * form of the index is returned so that callers still get a usable,
     * index-dependent value.
     */
    std::string at_index(size_t index) const
    {
        const auto list_size = this->rl_data.size();

        if (list_size == 0) {
            // Dividing/modding by a zero size is undefined behavior, so an
            // empty list has to be handled before doing any arithmetic.
            return std::to_string(index);
        }

        const auto counter = index / list_size;
        auto retval = this->rl_data[index % list_size];

        if (counter > 0) {
            retval += std::to_string(counter);
        }
        return retval;
    }
};

// JSON handlers used to populate a random_list.  The only handler binds the
// "data#" property path to random_list::rl_data.
// NOTE(review): the trailing '#' presumably marks an array-valued property in
// the yajlpp path syntax -- confirm against yajlpp_def.hh.
static const typed_json_path_container<random_list> random_list_handlers = {
    yajlpp::property_handler("data#").for_field(&random_list::rl_data),
};

/**
 * Parse the contents of `words_json` into a random_list using
 * random_list_handlers.
 *
 * @return The parsed list.  The parse result is unwrapped without checking
 * it for an error first.
 */
static random_list
load_word_list()
{
    static const intern_string_t source_name
        = intern_string::lookup(words_json.get_name());

    auto parsed = random_list_handlers.parser_for(source_name)
                      .with_ignore_unused(false)
                      .of(words_json.to_string_fragment());

    return parsed.unwrap();
}

/**
 * @return The word list.  It is built by load_word_list() the first time this
 * function is called and the same object is returned on every later call.
 */
static const random_list&
get_word_list()
{
    static const random_list word_list = load_word_list();

    return word_list;
}

/**
 * Parse the contents of `animals_json` into a random_list using
 * random_list_handlers.
 *
 * @return The parsed list.  The parse result is unwrapped without checking
 * it for an error first.
 */
static random_list
load_animal_list()
{
    static const intern_string_t source_name
        = intern_string::lookup(animals_json.get_name());

    auto parsed = random_list_handlers.parser_for(source_name)
                      .with_ignore_unused(false)
                      .of(animals_json.to_string_fragment());

    return parsed.unwrap();
}

/**
 * @return The animal list.  It is built by load_animal_list() the first time
 * this function is called and the same object is returned on every later
 * call.
 */
static const random_list&
get_animal_list()
{
    static const random_list animal_list = load_animal_list();

    return animal_list;
}

/**
 * Parse the contents of `diseases_json` into a random_list using
 * random_list_handlers.
 *
 * @return The parsed list.  The parse result is unwrapped without checking
 * it for an error first.
 */
static random_list
load_disease_list()
{
    static const intern_string_t source_name
        = intern_string::lookup(diseases_json.get_name());

    auto parsed = random_list_handlers.parser_for(source_name)
                      .with_ignore_unused(false)
                      .of(diseases_json.to_string_fragment());

    return parsed.unwrap();
}

/**
 * @return The disease list.  It is built by load_disease_list() the first
 * time this function is called and the same object is returned on every
 * later call.
 */
static const random_list&
get_disease_list()
{
    static const random_list disease_list = load_disease_list();

    return disease_list;
}

/**
 * Produce an anonymized copy of the given text.
 *
 * The text is split into tokens with a data_scanner and each token is
 * rewritten according to its type:
 *
 *  - URLs are parsed with libcurl and the user, password, host, path, query,
 *    and fragment parts are replaced individually;
 *  - paths, quoted strings, and XML attribute values are anonymized by
 *    recursively calling this method on their pieces;
 *  - credit-card numbers, hex dumps, URL passwords, and UUIDs are replaced
 *    with values derived from a hash of the original text;
 *  - MAC/IPv4/IPv6 addresses, user names, host names, and words/symbols are
 *    replaced through this->get_default() with a value generated from an
 *    index;
 *  - any other token is copied through unchanged.
 *
 * NOTE(review): get_default() is declared outside of this file.  It is used
 * here as "look up the input in the given map and, if needed, ask the
 * provider for a replacement given an index and the input" -- confirm
 * against the header.
 *
 * @param line The text to anonymize.
 * @return The anonymized text.
 */
std::string
text_anonymizer::next(string_fragment line)
{
    data_scanner ds(line);
    std::string retval;

    while (true) {
        auto tok_res = ds.tokenize2();
        if (!tok_res) {
            break;
        }

        switch (tok_res->tr_token) {
            case DT_URL: {
                auto url_str = tok_res->to_string();
                auto_mem<CURLU> cu(curl_url_cleanup);
                cu = curl_url();

                if (curl_url_set(cu, CURLUPART_URL, url_str.c_str(), 0)
                    != CURLUE_OK)
                {
                    retval += "<unparseable-url>";
                } else {
                    // Scratch holder that is reused for each part that is
                    // extracted from the URL.
                    auto_mem<char> url_part(curl_free);

                    if (curl_url_get(
                            cu, CURLUPART_USER, url_part.out(), CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        auto anon_user = this->get_default(
                            this->ta_user_names,
                            url_part.in(),
                            [](size_t size, auto& user) {
                                return get_animal_list().at_index(size);
                            });
                        curl_url_set(cu,
                                     CURLUPART_USER,
                                     anon_user.c_str(),
                                     CURLU_URLENCODE);
                    }

                    if (curl_url_get(cu,
                                     CURLUPART_PASSWORD,
                                     url_part.out(),
                                     CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        // Passwords are replaced by a hash of the original.
                        auto anon_pass
                            = hasher()
                                  .update(url_part.in(), strlen(url_part.in()))
                                  .to_string();
                        curl_url_set(cu,
                                     CURLUPART_PASSWORD,
                                     anon_pass.c_str(),
                                     CURLU_URLENCODE);
                    }

                    if (curl_url_get(
                            cu, CURLUPART_HOST, url_part.out(), CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        // The value stored in ta_host_names is the complete
                        // replacement host name.  The DT_EMAIL case below
                        // shares this map and must store the same form.
                        auto anon_host = this->get_default(
                            this->ta_host_names,
                            url_part.in(),
                            [](size_t size, auto& hn) {
                                const auto& diseases = get_disease_list();

                                return fmt::format(FMT_STRING("{}.example.com"),
                                                   diseases.at_index(size));
                            });
                        curl_url_set(cu,
                                     CURLUPART_HOST,
                                     anon_host.c_str(),
                                     CURLU_URLENCODE);
                    }

                    if (curl_url_get(
                            cu, CURLUPART_PATH, url_part.out(), CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        ghc::filesystem::path url_path(url_part.in());
                        ghc::filesystem::path anon_path;

                        for (const auto& comp : url_path) {
                            if (comp == comp.root_path()) {
                                // Keep the root component as-is.
                                anon_path = anon_path / comp;
                                continue;
                            }
                            anon_path = anon_path / this->next(comp.string());
                        }
                        curl_url_set(cu,
                                     CURLUPART_PATH,
                                     anon_path.c_str(),
                                     CURLU_URLENCODE);
                    }

                    if (curl_url_get(cu,
                                     CURLUPART_QUERY,
                                     url_part.out(),
                                     CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        static const auto SPLIT_RE
                            = lnav::pcre2pp::code::from_const(R"((&))");

                        // Clear the query so the anonymized parameters can
                        // be appended one at a time below.
                        curl_url_set(cu, CURLUPART_QUERY, nullptr, 0);

                        auto url_query
                            = string_fragment::from_c_str(url_part.in());
                        // Anonymizes a single "key=value" (or bare) query
                        // component and appends it to the URL.
                        auto replacer = [this, &cu](const std::string& comp) {
                            std::string anon_query;

                            auto eq_index = comp.find('=');
                            if (eq_index != std::string::npos) {
                                auto new_key
                                    = this->next(comp.substr(0, eq_index));
                                auto new_value
                                    = this->next(comp.substr(eq_index + 1));
                                anon_query = fmt::format(
                                    FMT_STRING("{}={}"), new_key, new_value);
                            } else {
                                anon_query = this->next(comp);
                            }

                            curl_url_set(cu,
                                         CURLUPART_QUERY,
                                         anon_query.c_str(),
                                         CURLU_URLENCODE | CURLU_APPENDQUERY);
                        };

                        auto loop_res
                            = SPLIT_RE.capture_from(url_query).for_each(
                                [&replacer](lnav::pcre2pp::match_data& md) {
                                    replacer(md.leading().to_string());
                                });
                        if (loop_res.isOk()) {
                            // Handle the text after the last '&'.
                            replacer(loop_res.unwrap().to_string());
                        }
                    }

                    if (curl_url_get(cu,
                                     CURLUPART_FRAGMENT,
                                     url_part.out(),
                                     CURLU_URLDECODE)
                        == CURLUE_OK)
                    {
                        auto anon_frag = this->next(
                            string_fragment::from_c_str(url_part.in()));

                        curl_url_set(cu,
                                     CURLUPART_FRAGMENT,
                                     anon_frag.c_str(),
                                     CURLU_URLENCODE);
                    }

                    auto_mem<char> anon_url(curl_free);
                    if (curl_url_get(cu, CURLUPART_URL, anon_url.out(), 0)
                        == CURLUE_OK)
                    {
                        retval.append(anon_url.in());
                    } else {
                        // Emit a placeholder instead of silently dropping
                        // the token from the output when the anonymized URL
                        // cannot be reassembled.
                        retval += "<unparseable-url>";
                    }
                }
                break;
            }
            case DT_PATH: {
                ghc::filesystem::path inp_path(tok_res->to_string());
                ghc::filesystem::path anon_path;

                for (const auto& comp : inp_path) {
                    auto comp_str = comp.string();
                    // The root and a component that is the whole path are
                    // copied verbatim instead of being passed back into
                    // next().
                    if (comp == comp.root_path() || comp == inp_path) {
                        anon_path = anon_path / comp;
                        continue;
                    }
                    anon_path = anon_path / this->next(comp_str);
                }

                retval += anon_path.string();
                break;
            }
            case DT_CREDIT_CARD_NUMBER: {
                auto cc = tok_res->to_string();
                // More than 16 characters is taken to mean the digits were
                // separated by spaces.
                auto has_spaces = cc.size() > 16;
                auto new_end = std::remove_if(
                    cc.begin(), cc.end(), [](auto ch) { return ch == ' '; });
                cc.erase(new_end, cc.end());
                auto anon_cc = hasher().update(cc).to_string().substr(0, 16);

                if (has_spaces) {
                    // Insert from the back so the earlier offsets stay valid.
                    anon_cc.insert(12, " ");
                    anon_cc.insert(8, " ");
                    anon_cc.insert(4, " ");
                }

                retval += anon_cc;
                break;
            }
            case DT_MAC_ADDRESS: {
                // 00-00-5E-00-53-00
                auto mac_addr = tok_res->to_string();

                retval += this->get_default(
                    this->ta_mac_addresses,
                    mac_addr,
                    [](size_t size, auto& inp) {
                        uint32_t base_mac = 0x5e005300;

                        base_mac += size;
                        auto anon_mac = byte_array<6>::from({
                            0x00,
                            0x00,
                            static_cast<unsigned char>((base_mac >> 24) & 0xff),
                            static_cast<unsigned char>((base_mac >> 16) & 0xff),
                            static_cast<unsigned char>((base_mac >> 8) & 0xff),
                            static_cast<unsigned char>((base_mac >> 0) & 0xff),
                        });

                        // The third character of the input is passed along,
                        // presumably as the separator to use when formatting
                        // -- confirm against byte_array::to_string().
                        return anon_mac.to_string(
                            std::make_optional(inp[2]));
                    });
                break;
            }
            case DT_HEX_DUMP: {
                auto hex_str = tok_res->to_string();
                auto hash_str = hasher().update(hex_str).to_array().to_string(
                    std::make_optional(hex_str[2]));
                std::string anon_hex;

                // Repeat the hash text until it covers the original length
                // and then trim it to exactly that length.
                while (anon_hex.size() < hex_str.size()) {
                    anon_hex += hash_str;
                }
                anon_hex.resize(hex_str.size());

                retval += anon_hex;
                break;
            }
            case DT_IPV4_ADDRESS: {
                auto ipv4 = tok_res->to_string();
                retval += this->get_default(
                    this->ta_ipv4_addresses, ipv4, [](size_t size, auto& _) {
                        char anon_ipv4[INET_ADDRSTRLEN];
                        struct in_addr ia;

                        // Replacement addresses count up from 10.0.0.1.
                        inet_aton("10.0.0.0", &ia);
                        ia.s_addr = htonl(ntohl(ia.s_addr) + 1 + size);
                        inet_ntop(AF_INET, &ia, anon_ipv4, sizeof(anon_ipv4));
                        return std::string{anon_ipv4};
                    });
                break;
            }
            case DT_IPV6_ADDRESS: {
                auto ipv6 = tok_res->to_string();
                retval += this->get_default(
                    this->ta_ipv6_addresses, ipv6, [](size_t size, auto& _) {
                        char anon_ipv6[INET6_ADDRSTRLEN];
                        struct in6_addr ia;
                        uint32_t low_word = 0;

                        // Replacement addresses count up from 2001:db8::1.
                        inet_pton(AF_INET6, "2001:db8::", &ia);
                        // Copy the last four bytes in and out with memcpy()
                        // rather than aliasing the byte array through a
                        // uint32_t pointer.
                        memcpy(&low_word, &ia.s6_addr[12], sizeof(low_word));
                        low_word = htonl(ntohl(low_word) + 1 + size);
                        memcpy(&ia.s6_addr[12], &low_word, sizeof(low_word));
                        inet_ntop(AF_INET6, &ia, anon_ipv6, sizeof(anon_ipv6));
                        return std::string{anon_ipv6};
                    });
                break;
            }
            case DT_EMAIL: {
                auto email_addr = tok_res->to_string();
                auto at_index = email_addr.find('@');

                // The host provider returns the complete host name, the same
                // form the DT_URL case stores in ta_host_names.  Storing a
                // bare name here and appending ".example.com" afterwards
                // would produce "x.example.com.example.com" for a host that
                // was first seen in a URL, and a URL host without a domain
                // for one that was first seen in an email address.
                retval += fmt::format(
                    FMT_STRING("{}@{}"),
                    this->get_default(this->ta_user_names,
                                      email_addr.substr(0, at_index),
                                      [](auto size, const auto& inp) {
                                          return get_animal_list().at_index(
                                              size);
                                      }),
                    this->get_default(
                        this->ta_host_names,
                        email_addr.substr(at_index + 1),
                        [](auto size, const auto& inp) {
                            return fmt::format(
                                FMT_STRING("{}.example.com"),
                                get_disease_list().at_index(size));
                        }));
                break;
            }
            case DT_WORD:
            case DT_SYMBOL: {
                // Separators and digit runs are kept; only the text between
                // them is replaced.
                static const auto SPLIT_RE = lnav::pcre2pp::code::from_const(
                    R"((\.|::|_|-|/|\\|\d+))");
                auto symbol_frag = ds.to_string_fragment(tok_res->tr_capture);
                auto sym_provider = [](auto size, const auto& inp) {
                    // Pieces of four characters or fewer are left alone.
                    if (inp.size() <= 4) {
                        return inp;
                    }

                    // Give the replacement word the same case style as the
                    // original piece.
                    auto comp_frag = string_fragment::from_str(inp);
                    return string_fragment::from_str(
                               get_word_list().at_index(size))
                        .to_string_with_case_style(
                            comp_frag.detect_text_case_style());
                };

                auto cap_res
                    = SPLIT_RE.capture_from(symbol_frag)
                          .for_each([this, &retval, &sym_provider](
                                        lnav::pcre2pp::match_data& md) {
                              auto comp = md.leading().to_string();
                              retval
                                  += this->get_default(
                                         this->ta_symbols, comp, sym_provider)
                                  + md[0]->to_string();
                          });
                if (cap_res.isErr()) {
                    retval += "<symbol>";
                } else {
                    // Handle the text after the last separator.
                    auto remaining = cap_res.unwrap().to_string();

                    retval += this->get_default(
                        this->ta_symbols, remaining, sym_provider);
                }
                break;
            }
            case DT_QUOTED_STRING: {
                auto anon_inner = this->next(
                    ds.to_string_fragment(tok_res->tr_inner_capture)
                        .to_string());

                // Keep everything before the inner capture and the last
                // character of the token; only the inner text is replaced.
                retval += line.sub_range(tok_res->tr_capture.c_begin,
                                         tok_res->tr_inner_capture.c_begin)
                              .to_string()
                    + anon_inner
                    + ds.to_string_fragment(tok_res->tr_capture).back();
                break;
            }
            case DT_XML_OPEN_TAG: {
                auto open_tag = tok_res->to_string();
                auto space_index = open_tag.find(' ');

                if (space_index == std::string::npos) {
                    // No space means no attributes to anonymize.
                    retval += open_tag;
                } else {
                    static const auto ATTR_RE
                        = lnav::pcre2pp::code::from_const(R"([\w\-]+=)");
                    static thread_local auto md
                        = lnav::pcre2pp::match_data::unitialized();

                    auto remaining = string_fragment::from_str_range(
                        open_tag, space_index, open_tag.size());

                    // Copy the tag name and the space that follows it.
                    retval += open_tag.substr(0, space_index + 1);
                    while (!remaining.empty()) {
                        auto cap_res = ATTR_RE.capture_from(remaining)
                                           .into(md)
                                           .matches()
                                           .ignore_error();

                        if (!cap_res) {
                            break;
                        }

                        // Copy the "name=" part, then anonymize the first
                        // token that follows it.
                        retval += md.leading();
                        retval += md[0]->to_string();
                        remaining = md.remaining();
                        // Named differently from the outer scanner so the
                        // outer `ds` is not shadowed.
                        data_scanner attr_ds(remaining);
                        auto attr_tok_res = attr_ds.tokenize2();
                        if (!attr_tok_res) {
                            continue;
                        }
                        retval += this->next(attr_tok_res->to_string());
                        remaining = remaining.substr(
                            attr_tok_res->tr_capture.length());
                    }

                    retval += remaining.to_string();
                }
                break;
            }
            case DT_UUID: {
                retval
                    += hasher().update(tok_res->to_string()).to_uuid_string();
                break;
            }
            default: {
                // Every other token type is copied through unchanged.
                log_debug("tok_re %d %d:%d",
                          tok_res->tr_token,
                          tok_res->tr_capture.c_begin,
                          tok_res->tr_capture.c_end);
                retval += tok_res->to_string();
                break;
            }
        }
    }

    return retval;
}

}  // namespace lnav