Run visual metrics processing within the test task.
parent
5597ed6e98
commit
9ed5e55318
@ -1,51 +0,0 @@
|
|||||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
---
|
|
||||||
loader: fenix_taskgraph.loader.multi_dep:loader
|
|
||||||
|
|
||||||
kind-dependencies:
|
|
||||||
- browsertime
|
|
||||||
- toolchain
|
|
||||||
|
|
||||||
primary-dependency:
|
|
||||||
- browsertime
|
|
||||||
|
|
||||||
group-by: attributes
|
|
||||||
|
|
||||||
only-for-attributes:
|
|
||||||
- run-visual-metrics
|
|
||||||
|
|
||||||
transforms:
|
|
||||||
- fenix_taskgraph.transforms.visual_metrics:transforms
|
|
||||||
- taskgraph.transforms.job:transforms
|
|
||||||
- taskgraph.transforms.task:transforms
|
|
||||||
|
|
||||||
job-template:
|
|
||||||
attributes:
|
|
||||||
nightly: true
|
|
||||||
description: "Run visual metrics calculations on Raptor"
|
|
||||||
run-on-projects: []
|
|
||||||
run-on-tasks-for: []
|
|
||||||
worker-type: b-android
|
|
||||||
treeherder:
|
|
||||||
tier: 2
|
|
||||||
kind: other
|
|
||||||
worker:
|
|
||||||
docker-image: {in-tree: visual-metrics}
|
|
||||||
max-run-time: 900
|
|
||||||
artifacts:
|
|
||||||
- type: file
|
|
||||||
name: public/perfherder-data.json
|
|
||||||
path: /builds/worker/artifacts/perfherder-data.json
|
|
||||||
- type: file
|
|
||||||
name: public/summary.json
|
|
||||||
path: /builds/worker/artifacts/summary.json
|
|
||||||
fetches:
|
|
||||||
toolchain:
|
|
||||||
- visual-metrics
|
|
||||||
run:
|
|
||||||
using: run-task
|
|
||||||
command: /builds/worker/bin/run-visual-metrics.py -- --orange --perceptual --contentful --force --renderignore 5 --json --viewport
|
|
||||||
checkout: false
|
|
||||||
run-as-root: true
|
|
@ -1,30 +0,0 @@
|
|||||||
FROM $DOCKER_IMAGE_PARENT
|
|
||||||
MAINTAINER Gregory Mierzwinski <gmierzwinski@mozilla.com>
|
|
||||||
|
|
||||||
# run-task expects to run as root
|
|
||||||
USER root
|
|
||||||
|
|
||||||
RUN apt-get update -qq && \
|
|
||||||
apt-get install -y \
|
|
||||||
ffmpeg \
|
|
||||||
imagemagick \
|
|
||||||
pyssim \
|
|
||||||
python \
|
|
||||||
python-pil
|
|
||||||
|
|
||||||
WORKDIR /builds/worker
|
|
||||||
|
|
||||||
USER worker:worker
|
|
||||||
|
|
||||||
COPY requirements.txt /builds/worker/requirements.txt
|
|
||||||
RUN pip3 install --require-hashes -r /builds/worker/requirements.txt && \
|
|
||||||
rm /builds/worker/requirements.txt
|
|
||||||
|
|
||||||
COPY similarity.py /builds/worker/bin/similarity.py
|
|
||||||
COPY run-visual-metrics.py /builds/worker/bin/run-visual-metrics.py
|
|
||||||
COPY performance-artifact-schema.json /builds/worker/performance-artifact-schema.json
|
|
||||||
|
|
||||||
USER root
|
|
||||||
RUN chmod +x /builds/worker/bin/run-visual-metrics.py
|
|
||||||
|
|
||||||
VOLUME /builds/worker/artifacts/
|
|
@ -1,230 +0,0 @@
|
|||||||
{
|
|
||||||
"definitions": {
|
|
||||||
"application_schema": {
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"title": "Application under performance test",
|
|
||||||
"enum": [
|
|
||||||
"firefox",
|
|
||||||
"chrome",
|
|
||||||
"chrome-m",
|
|
||||||
"chromium",
|
|
||||||
"fennec",
|
|
||||||
"geckoview",
|
|
||||||
"refbrow",
|
|
||||||
"fenix"
|
|
||||||
],
|
|
||||||
"maxLength": 10,
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"version": {
|
|
||||||
"title": "Application's version",
|
|
||||||
"maxLength": 40,
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["name"],
|
|
||||||
"type": "object"
|
|
||||||
},
|
|
||||||
"framework_schema": {
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"title": "Framework name",
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"type": "object"
|
|
||||||
},
|
|
||||||
"subtest_schema": {
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"title": "Subtest name",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"publicName": {
|
|
||||||
"title": "Public subtest name",
|
|
||||||
"description": "Allows renaming test's name, without breaking existing performance data series",
|
|
||||||
"maxLength": 30,
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"description": "Summary value for subtest",
|
|
||||||
"title": "Subtest value",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": -1000000000000.0,
|
|
||||||
"maximum": 1000000000000.0
|
|
||||||
},
|
|
||||||
"unit": {
|
|
||||||
"title": "Measurement unit",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"maxLength": 20
|
|
||||||
},
|
|
||||||
"lowerIsBetter": {
|
|
||||||
"description": "Whether lower values are better for subtest",
|
|
||||||
"title": "Lower is better",
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"shouldAlert": {
|
|
||||||
"description": "Whether we should alert",
|
|
||||||
"title": "Should alert",
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"alertThreshold": {
|
|
||||||
"description": "% change threshold before alerting",
|
|
||||||
"title": "Alert threshold",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0.0,
|
|
||||||
"maximum": 1000.0
|
|
||||||
},
|
|
||||||
"minBackWindow": {
|
|
||||||
"description": "Minimum back window to use for alerting",
|
|
||||||
"title": "Minimum back window",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
},
|
|
||||||
"maxBackWindow": {
|
|
||||||
"description": "Maximum back window to use for alerting",
|
|
||||||
"title": "Maximum back window",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
},
|
|
||||||
"foreWindow": {
|
|
||||||
"description": "Fore window to use for alerting",
|
|
||||||
"title": "Fore window",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"value"
|
|
||||||
],
|
|
||||||
"type": "object"
|
|
||||||
},
|
|
||||||
"suite_schema": {
|
|
||||||
"properties": {
|
|
||||||
"name": {
|
|
||||||
"title": "Suite name",
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"publicName": {
|
|
||||||
"title": "Public suite name",
|
|
||||||
"description": "Allows renaming suite's name, without breaking existing performance data series",
|
|
||||||
"maxLength": 30,
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"tags": {
|
|
||||||
"type": "array",
|
|
||||||
"title": "Free form tags, which ease the grouping & searching of performance tests",
|
|
||||||
"description": "Similar to extraOptions, except it does not break existing performance data series",
|
|
||||||
"items": {
|
|
||||||
"type": "string",
|
|
||||||
"pattern": "^[a-zA-Z0-9-]{1,24}$"
|
|
||||||
},
|
|
||||||
"uniqueItems": true,
|
|
||||||
"maxItems": 14
|
|
||||||
},
|
|
||||||
"extraOptions": {
|
|
||||||
"type": "array",
|
|
||||||
"title": "Extra options used in running suite",
|
|
||||||
"items": {
|
|
||||||
"type": "string",
|
|
||||||
"maxLength": 100
|
|
||||||
},
|
|
||||||
"uniqueItems": true,
|
|
||||||
"maxItems": 8
|
|
||||||
},
|
|
||||||
"subtests": {
|
|
||||||
"items": {
|
|
||||||
"$ref": "#/definitions/subtest_schema"
|
|
||||||
},
|
|
||||||
"title": "Subtests",
|
|
||||||
"type": "array"
|
|
||||||
},
|
|
||||||
"value": {
|
|
||||||
"title": "Suite value",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": -1000000000000.0,
|
|
||||||
"maximum": 1000000000000.0
|
|
||||||
},
|
|
||||||
"unit": {
|
|
||||||
"title": "Measurement unit",
|
|
||||||
"type": "string",
|
|
||||||
"minLength": 1,
|
|
||||||
"maxLength": 20
|
|
||||||
},
|
|
||||||
"lowerIsBetter": {
|
|
||||||
"description": "Whether lower values are better for suite",
|
|
||||||
"title": "Lower is better",
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"shouldAlert": {
|
|
||||||
"description": "Whether we should alert on this suite (overrides default behaviour)",
|
|
||||||
"title": "Should alert",
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"alertThreshold": {
|
|
||||||
"description": "% change threshold before alerting",
|
|
||||||
"title": "Alert threshold",
|
|
||||||
"type": "number",
|
|
||||||
"minimum": 0.0,
|
|
||||||
"maximum": 1000.0
|
|
||||||
},
|
|
||||||
"minBackWindow": {
|
|
||||||
"description": "Minimum back window to use for alerting",
|
|
||||||
"title": "Minimum back window",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
},
|
|
||||||
"maxBackWindow": {
|
|
||||||
"description": "Maximum back window to use for alerting",
|
|
||||||
"title": "Maximum back window",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
},
|
|
||||||
"foreWindow": {
|
|
||||||
"description": "Fore window to use for alerting",
|
|
||||||
"title": "Fore window",
|
|
||||||
"type": "integer",
|
|
||||||
"minimum": 1,
|
|
||||||
"maximum": 255
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"name",
|
|
||||||
"subtests"
|
|
||||||
],
|
|
||||||
"type": "object"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"description": "Structure for submitting performance data as part of a job",
|
|
||||||
"id": "https://treeherder.mozilla.org/schemas/v1/performance-artifact.json#",
|
|
||||||
"properties": {
|
|
||||||
"application":{
|
|
||||||
"$ref": "#/definitions/application_schema"
|
|
||||||
},
|
|
||||||
"framework": {
|
|
||||||
"$ref": "#/definitions/framework_schema"
|
|
||||||
},
|
|
||||||
"suites": {
|
|
||||||
"description": "List of suite-level data submitted as part of this structure",
|
|
||||||
"items": {
|
|
||||||
"$ref": "#/definitions/suite_schema"
|
|
||||||
},
|
|
||||||
"title": "Performance suites",
|
|
||||||
"type": "array"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": [
|
|
||||||
"framework",
|
|
||||||
"suites"
|
|
||||||
],
|
|
||||||
"title": "Perfherder Schema",
|
|
||||||
"type": "object"
|
|
||||||
}
|
|
@ -1,23 +0,0 @@
|
|||||||
# Dependency hashes must be for python3.6
|
|
||||||
|
|
||||||
# Direct dependencies
|
|
||||||
attrs==19.1.0 --hash=sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79
|
|
||||||
structlog==19.1.0 --hash=sha256:db441b81c65b0f104a7ce5d86c5432be099956b98b8a2c8be0b3fb3a7a0b1536
|
|
||||||
voluptuous==0.11.5 --hash=sha256:303542b3fc07fb52ec3d7a1c614b329cdbee13a9d681935353d8ea56a7bfa9f1
|
|
||||||
jsonschema==3.2.0 --hash=sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163
|
|
||||||
numpy==1.18.3 --hash=sha256:a551d8cc267c634774830086da42e4ba157fa41dd3b93982bc9501b284b0c689
|
|
||||||
scipy==1.4.1 --hash=sha256:386086e2972ed2db17cebf88610aab7d7f6e2c0ca30042dc9a89cf18dcc363fa
|
|
||||||
matplotlib==3.0.3 --hash=sha256:e8d1939262aa6b36d0c51f50a50a43a04b9618d20db31e6c0192b1463067aeef
|
|
||||||
opencv-python==4.2.0.34 --hash=sha256:dcb8da8c5ebaa6360c8555547a4c7beb6cd983dd95ba895bb78b86cc8cf3de2b
|
|
||||||
|
|
||||||
# Transitive dependencies
|
|
||||||
importlib_metadata==1.1.0 --hash=sha256:e6ac600a142cf2db707b1998382cc7fc3b02befb7273876e01b8ad10b9652742
|
|
||||||
more_itertools==8.0.0 --hash=sha256:a0ea684c39bc4315ba7aae406596ef191fd84f873d2d2751f84d64e81a7a2d45
|
|
||||||
pyrsistent==0.15.6 --hash=sha256:f3b280d030afb652f79d67c5586157c5c1355c9a58dfc7940566e28d28f3df1b
|
|
||||||
six==1.12.0 --hash=sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c
|
|
||||||
zipp==0.6.0 --hash=sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335
|
|
||||||
cycler==0.10.0 --hash=sha256:1d8a5ae1ff6c5cf9b93e8811e581232ad8920aeec647c37316ceac982b08cb2d
|
|
||||||
kiwisolver==1.1.0 --hash=sha256:400599c0fe58d21522cae0e8b22318e09d9729451b17ee61ba8e1e7c0346565c
|
|
||||||
pyparsing==2.4.7 --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b
|
|
||||||
python-dateutil==2.8.1 --hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a
|
|
||||||
setuptools==46.1.3 --hash=sha256:4fe404eec2738c20ab5841fa2d791902d2a645f32318a7850ef26f8d7215a8ee
|
|
@ -1,496 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
#
|
|
||||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
|
|
||||||
"""Instrument visualmetrics.py to run in parallel."""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import statistics
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import tarfile
|
|
||||||
import time
|
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
from functools import partial
|
|
||||||
from multiprocessing import cpu_count
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import attr
|
|
||||||
import structlog
|
|
||||||
from jsonschema import validate
|
|
||||||
from voluptuous import ALLOW_EXTRA, Required, Schema
|
|
||||||
|
|
||||||
|
|
||||||
#: The max run time for a command (5 minutes)
|
|
||||||
MAX_TIME = 300
|
|
||||||
|
|
||||||
|
|
||||||
#: The directory where artifacts from this job will be placed.
|
|
||||||
OUTPUT_DIR = Path("/", "builds", "worker", "artifacts")
|
|
||||||
|
|
||||||
|
|
||||||
#: A job to process through visualmetrics.py
@attr.s
class Job:
    """A single visualmetrics.py work item: one video from one test run."""

    #: The name of the test.
    test_name = attr.ib(type=str)

    #: A unique number for the job.
    count = attr.ib(type=int)

    #: The tags for this job.
    #: NOTE(review): declared ``type=str`` but JOB_SCHEMA requires ``[str]``
    #: and main() passes the list straight through — confirm the annotation.
    tags = attr.ib(type=str)

    #: The extra options for this job.
    #: NOTE(review): same as ``tags`` — callers pass a list of strings.
    extra_options = attr.ib(type=str)

    #: If true, we allow 0's in the vismet results
    accept_zero_vismet = attr.ib(type=bool)

    #: json_path: The path to the ``browsertime.json`` file on disk.
    json_path = attr.ib(type=Path)

    #: video_path: The path of the video file on disk.
    video_path = attr.ib(type=Path)
|
|
||||||
|
|
||||||
|
|
||||||
#: The schema for validating jobs.
JOB_SCHEMA = Schema(
    {
        Required("jobs"): [
            {
                Required("test_name"): str,
                Required("browsertime_json_path"): str,
                Required("tags"): [str],
                Required("extra_options"): [str],
                Required("accept_zero_vismet"): bool,
            }
        ],
        Required("application"): {Required("name"): str, "version": str},
        Required("extra_options"): [str],
    }
)

#: A partial schema for browsertime.json files.
BROWSERTIME_SCHEMA = Schema(
    [{Required("files"): {Required("video"): [str]}}], extra=ALLOW_EXTRA
)

#: Per-metric default for the Perfherder ``shouldAlert`` flag
#: (looked up by ``append_result``; unknown metrics default to False).
SHOULD_ALERT = {
    "ContentfulSpeedIndex": True,
    "FirstVisualChange": True,
    "LastVisualChange": True,
    "PerceptualSpeedIndex": True,
    "SpeedIndex": True,
    "videoRecordingStart": False,
}

# The Perfherder artifact schema is jsonschema-based (validated in main()),
# so it is loaded verbatim from the JSON file baked into the docker image.
with Path("/", "builds", "worker", "performance-artifact-schema.json").open() as f:
    PERFHERDER_SCHEMA = json.loads(f.read())
|
|
||||||
|
|
||||||
|
|
||||||
def run_command(log, cmd, job_count):
    """Run a command with a watchdog timeout, relaying its output to the log.

    Output lines are read from the merged stdout/stderr stream until the
    process exits or ``MAX_TIME`` seconds elapse.

    Args:
        log: The structlog logger instance.
        cmd: the command to run as a list of strings.
        job_count: Number identifying this job; used as a log-line prefix.

    Returns:
        A tuple of the process' exit status and standard output.
    """
    log.info("Running command", cmd=cmd)
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    lines = []
    res = None
    start = time.time()
    # Poll stdout line-by-line until EOF+exit or until the deadline passes.
    while time.time() - start <= MAX_TIME:
        time.sleep(0.1)
        output = process.stdout.readline()
        if output == b"" and process.poll() is not None:
            # EOF and the process has terminated: normal completion.
            break
        if output:
            res = output.strip()
            lines.append(res.decode("utf-8", "ignore"))
        else:
            # No output yet; back off before polling again.
            time.sleep(5)

    if time.time() - start > MAX_TIME:
        log.error(
            "TEST-UNEXPECTED-FAIL | Timed out waiting for response from command",
            cmd=cmd,
        )
        return 1, "Timed out"

    rc = process.poll()
    job_prefix = "[JOB-" + str(job_count) + "] "
    for line in lines:
        # Some output doesn't start with the levels because it comes
        # from FFMPEG rather than the script itself
        if line.startswith(("[INFO]", "[WARNING]", "[CRITICAL]", "[ERROR]")):
            splitline = line.split(" - ")
            level = splitline[0]
            line = " - ".join(splitline[1:])
        else:
            level = "[INFO]"

        newline = job_prefix + line
        if level.strip() in ("[ERROR]", "[CRITICAL]"):
            # Any error/critical output forces a non-zero return code.
            if rc == 0:
                rc = 1
            log.error("TEST-UNEXPECTED-FAIL | " + newline)
        elif level == "[WARNING]":
            log.warning(newline)
        else:
            log.info(newline)

    # NOTE(review): ``res`` is the last stripped stdout line, still bytes —
    # run_visual_metrics decodes and json.loads it, so the command is assumed
    # to emit its JSON result as the final line.
    return rc, res
|
|
||||||
|
|
||||||
|
|
||||||
def append_result(log, suites, test_name, name, result, tags, extra_options):
    """Append a ``name`` metrics result in the ``test_name`` suite.

    Creates the suite entry on first use; subsequent calls accumulate the
    value into the metric's replicate list.

    Args:
        log: The structlog logger instance.
        suites: A mapping containing the suites.
        test_name: The name of the test.
        name: The name of the metrics.
        result: The value to append.
        tags: Tags recorded on a newly-created suite entry.
        extra_options: Extra options recorded on a newly-created suite entry.
    """
    # Progress metrics are intentionally not recorded.
    if name.endswith("Progress"):
        return

    try:
        result = int(result)
    except ValueError:
        log.error("Could not convert value", name=name)
        log.error("%s" % result)
        result = 0

    orig_test_name = test_name
    existing = suites.get(test_name)
    if existing is not None and existing["extraOptions"] != extra_options:
        # Same test seen with different options: derive a distinct suite key
        # from the options not present on the already-recorded run.
        missing = set(extra_options) - set(existing["extraOptions"])
        test_name = test_name + "-".join(list(missing))

    subtests = suites.setdefault(
        test_name,
        {
            "name": orig_test_name,
            "tags": extra_options + tags + ["visual"],
            "subtests": {},
            "extraOptions": extra_options,
        },
    )["subtests"]

    entry = subtests.get(name)
    if entry is None:
        subtests[name] = {
            "name": name,
            "replicates": [result],
            "lowerIsBetter": True,
            "unit": "ms",
            "shouldAlert": SHOULD_ALERT.get(name, False),
        }
    else:
        entry["replicates"].append(result)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_median(subtest):
    """Store the median of all replicates on the subtest as ``value``.

    Args:
        subtest: The subtest containing all replicates.

    Returns:
        The subtest (mutated in place when replicates are present).
    """
    try:
        samples = subtest["replicates"]
    except KeyError:
        # Nothing to aggregate.
        return subtest
    subtest["value"] = statistics.median(samples)
    return subtest
|
|
||||||
|
|
||||||
|
|
||||||
def get_suite(suite):
    """Convert a suite's subtest mapping to a list with computed medians.

    Args:
        suite: The suite to convert.

    Returns:
        The suite (mutated in place).
    """
    suite["subtests"] = list(map(compute_median, suite["subtests"].values()))
    return suite
|
|
||||||
|
|
||||||
|
|
||||||
def read_json(json_path, schema):
    """Load a JSON file and check it against the provided schema.

    Args:
        json_path: Path of json file to parse.
        schema: A callable to validate the JSON's schema.

    Returns:
        The contents of the file at ``json_path`` interpreted as JSON.

    Raises:
        Whatever the open/parse or the schema callable raises; each failure
        is logged before being re-raised.
    """
    try:
        with open(str(json_path), "r", encoding="utf-8", errors="ignore") as f:
            contents = json.load(f)
    except Exception:
        log.error("Could not read JSON file", path=json_path, exc_info=True)
        raise

    log.info("Loaded JSON from file", path=json_path)

    try:
        schema(contents)
    except Exception:
        log.error("JSON failed to validate", exc_info=True)
        raise

    return contents
|
|
||||||
|
|
||||||
|
|
||||||
def main(log, args):
    """Run visualmetrics.py in parallel.

    Args:
        log: The structlog logger instance.
        args: The parsed arguments from the argument parser.

    Returns:
        The return code that the program will exit with.
    """
    # All inputs (visualmetrics.py and the browsertime archive) are expected
    # in the directory the fetches transform populated.
    fetch_dir = os.getenv("MOZ_FETCHES_DIR")
    if not fetch_dir:
        log.error("Expected MOZ_FETCHES_DIR environment variable.")
        return 1

    fetch_dir = Path(fetch_dir)

    visualmetrics_path = fetch_dir / "visualmetrics.py"
    if not visualmetrics_path.exists():
        log.error(
            "Could not locate visualmetrics.py", expected_path=str(visualmetrics_path)
        )
        return 1

    browsertime_results_path = fetch_dir / "browsertime-results.tgz"

    try:
        with tarfile.open(str(browsertime_results_path)) as tar:
            tar.extractall(path=str(fetch_dir))
    except Exception:
        log.error(
            "Could not read/extract browsertime results archive",
            path=browsertime_results_path,
            exc_info=True,
        )
        return 1
    log.info("Extracted browsertime results", path=browsertime_results_path)

    try:
        jobs_json_path = fetch_dir / "browsertime-results" / "jobs.json"
        jobs_json = read_json(jobs_json_path, JOB_SCHEMA)
    except Exception:
        log.error(
            "Could not open the jobs.json file", path=jobs_json_path, exc_info=True
        )
        return 1

    # Expand every (test, video) pair in the archive into one Job.
    jobs = []
    count = 0

    for job in jobs_json["jobs"]:
        browsertime_json_path = fetch_dir / job["browsertime_json_path"]

        try:
            browsertime_json = read_json(browsertime_json_path, BROWSERTIME_SCHEMA)
        except Exception:
            log.error(
                "Could not open a browsertime.json file",
                path=browsertime_json_path,
                exc_info=True,
            )
            return 1

        for site in browsertime_json:
            for video in site["files"]["video"]:
                count += 1
                name = job["test_name"]
                # A non-empty alias distinguishes per-site results.
                if "alias" in site["info"] and site["info"]["alias"].strip() != "":
                    name = "%s.%s" % (name, site["info"]["alias"])
                jobs.append(
                    Job(
                        test_name=name,
                        tags=job["tags"],
                        # Fall back to the top-level options when the job
                        # declares none of its own.
                        extra_options=len(job["extra_options"]) > 0
                        and job["extra_options"]
                        or jobs_json["extra_options"],
                        accept_zero_vismet=job["accept_zero_vismet"],
                        json_path=browsertime_json_path,
                        video_path=browsertime_json_path.parent / video,
                        count=count,
                    )
                )

    failed_runs = 0
    suites = {}

    # Fan the jobs out across all cores; zip pairs each job with its result
    # because executor.map preserves input order.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for job, result in zip(
            jobs,
            executor.map(
                partial(
                    run_visual_metrics,
                    visualmetrics_path=visualmetrics_path,
                    options=args.visual_metrics_options,
                ),
                jobs,
            ),
        ):
            returncode, res = result
            if returncode != 0:
                log.error(
                    "Failed to run visualmetrics.py",
                    video_path=job.video_path,
                    error=res,
                )
                failed_runs += 1
            else:
                # Accumulate each metric as a replicate under its suite.
                for name, value in res.items():
                    append_result(
                        log,
                        suites,
                        job.test_name,
                        name,
                        value,
                        job.tags,
                        job.extra_options,
                    )

    suites = [get_suite(suite) for suite in suites.values()]

    perf_data = {
        "framework": {"name": "browsertime"},
        "application": jobs_json["application"],
        "type": "pageload",
        "suites": suites,
    }

    # TODO: Try to get the similarity for all possible tests, this means that we
    # will also get a comparison of recorded vs. live sites to check the on-going
    # quality of our recordings.
    # Bug 1674927 - Similarity metric is disabled until we figure out
    # why it had a huge increase in run time.

    # Validates the perf data complies with perfherder schema.
    # The perfherder schema uses jsonschema so we can't use voluptuous here.
    validate(perf_data, PERFHERDER_SCHEMA)

    raw_perf_data = json.dumps(perf_data)
    with Path(OUTPUT_DIR, "perfherder-data.json").open("w") as f:
        f.write(raw_perf_data)
    # Prints the data in logs for Perfherder to pick it up.
    log.info("PERFHERDER_DATA: %s" % raw_perf_data)

    # Lists the number of processed jobs, failures, and successes.
    with Path(OUTPUT_DIR, "summary.json").open("w") as f:
        json.dump(
            {
                "total_jobs": len(jobs),
                "successful_runs": len(jobs) - failed_runs,
                "failed_runs": failed_runs,
            },
            f,
        )

    # If there's one failure along the way, we want to return > 0
    # to trigger a red job in TC.
    return failed_runs
|
|
||||||
|
|
||||||
|
|
||||||
def run_visual_metrics(job, visualmetrics_path, options):
    """Run visualmetrics.py on the input job.

    Returns:
        A returncode and a string containing the output of visualmetrics.py
    """
    base_cmd = [
        "/usr/bin/python",
        str(visualmetrics_path),
        "-vvv",
        "--logformat",
        "[%(levelname)s] - %(message)s",
        "--video",
        str(job.video_path),
    ]
    rc, res = run_command(log, base_cmd + list(options), job.count)

    if rc == 0:
        # The command's final output line is its JSON result payload.
        res = json.loads(res.decode("utf8"))

        zero_valued = []
        if not job.accept_zero_vismet:
            # A zero in any of these metrics indicates a failing test.
            watched = {
                "contentfulspeedindex",
                "lastvisualchange",
                "perceptualspeedindex",
                "speedindex",
            }
            zero_valued = [
                metric
                for metric, val in res.items()
                if metric.lower() in watched and val == 0
            ]

        if zero_valued:
            log.error(
                "TEST-UNEXPECTED-FAIL | Some visual metrics have an erroneous value of 0."
            )
            log.info("Tests which failed: %s" % str(zero_valued))
            rc += 1

    return rc, res
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Route structlog output through stdlib logging at INFO level.
    logging.basicConfig(format="%(levelname)s - %(message)s", level=logging.INFO)
    structlog.configure(
        processors=[
            structlog.processors.format_exc_info,
            structlog.dev.ConsoleRenderer(colors=False),
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )

    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # All positional arguments are forwarded verbatim to visualmetrics.py.
    parser.add_argument(
        "visual_metrics_options",
        type=str,
        metavar="VISUAL-METRICS-OPTIONS",
        help="Options to pass to visualmetrics.py",
        nargs="*",
    )

    args = parser.parse_args()
    log = structlog.get_logger()

    try:
        # main() returns the number of failed runs; non-zero turns the
        # Taskcluster job red.
        sys.exit(main(log, args))
    except Exception as e:
        log.error("Unhandled exception: %s" % e, exc_info=True)
        sys.exit(1)
|
|
@ -1,360 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
#
|
|
||||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
import cv2
|
|
||||||
import json
|
|
||||||
import numpy as np
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import shutil
|
|
||||||
import socket
|
|
||||||
import structlog
|
|
||||||
import tarfile
|
|
||||||
import tempfile
|
|
||||||
import urllib
|
|
||||||
|
|
||||||
from functools import wraps
|
|
||||||
from matplotlib import pyplot as plt
|
|
||||||
from scipy.stats import spearmanr
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level logger handle.  NOTE(review): assigned ``None`` here; the
# code that populates it is outside this view — confirm before relying on it.
log = None


# We add the `and` conditions to it later
base_ad_query = {
    "from": "task",
    "limit": 1000,
    "where": {
        "and": []
    },
    "select": [
        "action.start_time",
        "run.name",
        "task.artifacts",
        "task.group.id",
        "task.id"
    ],
}
|
|
||||||
|
|
||||||
|
|
||||||
def socket_timeout(value=120):
    """Decorator factory applying a socket default timeout around a call.

    The previous default timeout is restored after the wrapped function
    returns or raises.
    """
    def _socket_timeout(func):
        @wraps(func)
        def __socket_timeout(*args, **kw):
            previous = socket.getdefaulttimeout()
            socket.setdefaulttimeout(value)
            try:
                return func(*args, **kw)
            finally:
                socket.setdefaulttimeout(previous)
        return __socket_timeout
    return _socket_timeout
|
|
||||||
|
|
||||||
|
|
||||||
def _open_data(file):
    """Open the given path as an OpenCV ``VideoCapture`` stream."""
    return cv2.VideoCapture(str(file))
|
|
||||||
|
|
||||||
|
|
||||||
@socket_timeout(120)
def _query_activedata(query_json):
    """Used to run queries on active data.

    Posts ``query_json`` to the ActiveData endpoint and returns the parsed
    ``data`` field of the response.
    """
    active_data_url = "http://activedata.allizom.org/query"

    req = urllib.request.Request(active_data_url)
    req.add_header("Content-Type", "application/json")
    jsondata = json.dumps(query_json)

    jsondataasbytes = jsondata.encode("utf-8")
    req.add_header("Content-Length", len(jsondataasbytes))

    log.info("Querying Active-data...")
    response = urllib.request.urlopen(req, jsondataasbytes)
    # NOTE(review): the braces make ``{str(...)}`` a one-element set literal,
    # so the log shows e.g. "{'200'}" — presumably plain interpolation was
    # intended; confirm before changing the log format.
    log.info("Status: %s" % {str(response.getcode())})

    # NOTE(review): the quote replacement assumes the payload contains no
    # legitimate apostrophes — verify against real ActiveData responses.
    data = json.loads(response.read().decode("utf8").replace("'", '"'))["data"]
    return data
|
|
||||||
|
|
||||||
|
|
||||||
@socket_timeout(120)
def _download(url, loc):
    """Downloads from a url (with a timeout)."""
    log.info("Downloading %s" % url)
    try:
        urllib.request.urlretrieve(url, loc)
    except Exception as exc:
        # Best-effort: a failed download is reported, not raised.
        log.info(str(exc))
        return False
    else:
        return True
|
|
||||||
|
|
||||||
|
|
||||||
def _get_frames(video):
    """Return every frame of ``video`` as a list of grayscale images.

    The capture is released once the stream is exhausted.
    """
    frames = []
    while video.isOpened():
        ok, frame = video.read()
        if not ok:
            video.release()
            break
        # Grayscale frames keep the later histogram comparison simple.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    return frames
|
|
||||||
|
|
||||||
|
|
||||||
def _get_browsertime_results(query):
    """Run an ActiveData query and fetch the newest browsertime artifact.

    Args:
        query: ActiveData query (dict) selecting browsertime tasks.
    Returns:
        Path to a temporary directory holding the extracted
        browsertime-results artifact, or None on any failure.
    """
    failed = False
    try:
        data = _query_activedata(query)
    except Exception as e:
        log.info(str(e))
        failed = True
    # `data` is only unbound when `failed` is True, so the short-circuit
    # below never evaluates it in that case.
    if failed or not data:
        log.info("Couldn't get activedata data")
        return None

    # Find the newest browsertime task
    log.info("Found %s datums" % str(len(data["action.start_time"])))
    maxind = np.argmax([float(t) for t in data["action.start_time"]])
    artifacts = data["task.artifacts"][maxind]
    btime_artifact = None
    for art in artifacts:
        if "browsertime-results" in art["name"]:
            btime_artifact = art["url"]
            break
    if not btime_artifact:
        log.info("Can't find an older site test")
        return None

    log.info("Comparing videos to TASK_GROUP=%s, TASK_ID=%s" % (
        data["task.group.id"][maxind], data["task.id"][maxind]
    ))

    # Download the browsertime videos and untar them
    tmpdir = tempfile.mkdtemp()
    loc = os.path.join(tmpdir, "tmpfile.tgz")
    try:
        if not _download(btime_artifact, loc):
            log.info(
                "Failed to download browsertime-results artifact from %s" % btime_artifact
            )
            return None
        tmploc = tempfile.mkdtemp()
        try:
            with tarfile.open(str(loc)) as tar:
                # NOTE(review): extractall() trusts member paths in the
                # downloaded archive (tarfile path-traversal). The artifact
                # comes from our own CI, but consider validating members.
                tar.extractall(path=tmploc)
        except Exception:
            log.info(
                "Could not read/extract old browsertime results archive",
                path=loc,
                exc_info=True,
            )
            return None
    finally:
        # The downloaded archive is no longer needed once extraction has
        # been attempted; previously this directory leaked on every call.
        shutil.rmtree(tmpdir, ignore_errors=True)

    return tmploc
|
|
||||||
|
|
||||||
|
|
||||||
def _data_from_last_task(label):
    """Fetch browsertime results from the last PGO/OPT task with this label.

    Both the OPT and PGO variants of the label are searched for, since the
    difference between them should be minimal. Setting the TC_GROUP_ID
    environment variable restricts the search to that task group, which
    provides a way to compare this task's recordings to a known task.
    """
    label_opt = label.replace("/pgo", "/opt")
    label_pgo = label.replace("/opt", "/pgo")

    conditions = [
        {"in": {"task.run.state": ["completed"]}},
        {"or": [
            {"eq": {"run.name": label_pgo}},
            {"eq": {"run.name": label_opt}},
        ]},
    ]

    task_group_id = os.getenv("TC_GROUP_ID", "")
    if task_group_id:
        # Compare against a specific, known task group.
        conditions.append({"eq": {"task.group.id": task_group_id}})
    else:
        # Otherwise fall back to recent mozilla-central tasks.
        conditions.extend([
            {"in": {"repo.branch.name": ["mozilla-central"]}},
            {"gte": {"action.start_time": {"date": "today-week-week"}}},
        ])

    base_ad_query["where"]["and"] = conditions

    return _get_browsertime_results(base_ad_query)
|
|
||||||
|
|
||||||
|
|
||||||
def _data_from_last_live_task(label):
    """Fetch browsertime results from the most recent live-site PGO task."""
    label_live = label.replace("/opt", "/pgo").replace("tp6m", "tp6m-live")

    # Recent, completed mozilla-central runs of the live-site variant.
    conditions = [
        {"in": {"repo.branch.name": ["mozilla-central"]}},
        {"gte": {"action.start_time": {"date": "today-week-week"}}},
        {"in": {"task.run.state": ["completed"]}},
        {"eq": {"run.name": label_live}},
    ]
    base_ad_query["where"]["and"] = conditions

    return _get_browsertime_results(base_ad_query)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_similarity(old_videos_info, new_videos_info, output, prefix=""):
    """Calculates a similarity score for two groupings of videos.

    The technique works as follows:
        1. Get the last live site test.
        2. For each of the NxN video pairings (N is the size of the
           smaller group), build a cross-correlation matrix:
            1. Get each of the videos and calculate their histograms
               across the full videos.
            2. Calculate the Spearman correlation coefficient between them.
        3. Average the cross-correlation matrix to obtain the score.

    The 2D similarity score is the same, except that it builds a histogram
    from the final frame instead of the full video.

    Args:
        old_videos_info: List of dicts with "data" (an open video capture)
            and "path" entries for the reference videos.
        new_videos_info: Same structure, for the videos from this task.
        output: Location to output videos with low similarity scores.
        prefix: Prefix a string to the output.
    Returns:
        Two similarity scores (3D, 2D) as a float.
    """
    nhists = []
    nhists2d = []

    old_videos = [entry["data"] for entry in old_videos_info]
    new_videos = [entry["data"] for entry in new_videos_info]

    total_vids = min(len(old_videos), len(new_videos))
    xcorr = np.zeros((total_vids, total_vids))
    xcorr2d = np.zeros((total_vids, total_vids))

    for i in range(total_vids):
        datao = np.asarray(_get_frames(old_videos[i]))

        # np.histogram produces the same counts as the previous plt.hist
        # call without drawing into (and leaking) matplotlib figures.
        histo, _ = np.histogram(datao.flatten(), bins=255)
        histo2d, _ = np.histogram(datao[-1, :, :].flatten(), bins=255)

        for j in range(total_vids):
            if i == 0:
                # Only calculate the histograms once; it takes time
                datan = np.asarray(_get_frames(new_videos[j]))

                histn, _ = np.histogram(datan.flatten(), bins=255)
                histn2d, _ = np.histogram(datan[-1, :, :].flatten(), bins=255)

                nhists.append(histn)
                nhists2d.append(histn2d)
            else:
                histn = nhists[j]
                histn2d = nhists2d[j]

            rho, _ = spearmanr(histn, histo)
            rho2d, _ = spearmanr(histn2d, histo2d)

            xcorr[i, j] = rho
            xcorr2d[i, j] = rho2d

    similarity = np.mean(xcorr)
    similarity2d = np.mean(xcorr2d)

    log.info("Average 3D similarity: %s" % str(np.round(similarity, 5)))
    log.info("Average 2D similarity: %s" % str(np.round(similarity2d, 5)))

    if np.round(similarity, 1) <= 0.7 or np.round(similarity2d, 1) <= 0.7:
        # For low correlations, output the worst video pairing
        # so that we can visually see what the issue was
        minind = np.unravel_index(np.argmin(xcorr, axis=None), xcorr.shape)

        oldvid = old_videos_info[minind[0]]["path"]
        shutil.copyfile(oldvid, str(pathlib.Path(output, "%sold_video.mp4" % prefix)))

        newvid = new_videos_info[minind[1]]["path"]
        shutil.copyfile(newvid, str(pathlib.Path(output, "%snew_video.mp4" % prefix)))

    return np.round(similarity, 5), np.round(similarity2d, 5)
|
|
||||||
|
|
||||||
|
|
||||||
def calculate_similarity(jobs_json, fetch_dir, output):
    """Compute the similarity metrics for this task's video recordings.

    ActiveData is used to find the last task with the same label and the
    last live-site variant of that test; the videos recorded by the
    current task are compared against each of them. For live-site tests
    only the two non-playback metrics are produced, since playback
    similarity is not applicable there.

    Args:
        jobs_json: The jobs JSON that holds extra information.
        fetch_dir: The fetch directory that holds the new videos.
        output: The output directory.
    Returns:
        A dictionary with up to 4 metrics (values default to None when a
        metric couldn't be calculated):
            PlaybackSimilarity: Similarity of the full playback to a live
                site test.
            PlaybackSimilarity2D: - // - (but for the final frame only)
            Similarity: Similarity of the tests video recording to its
                last run.
            Similarity2D: - // - (but for the final frame only)
    """
    global log
    log = structlog.get_logger()

    label = os.getenv("TC_LABEL", "")
    if not label:
        log.info("TC_LABEL is undefined, cannot calculate similarity metrics")
        return {}

    def _load_videos(root):
        # Open every mp4 below `root`, keeping the path for later copies.
        return [
            {"data": _open_data(str(f)), "path": str(f)}
            for f in pathlib.Path(root).rglob("*.mp4")
        ]

    # Get all the newest videos from this task
    new_btime_videos = _load_videos(fetch_dir)
    log.info("Found %s new videos" % str(len(new_btime_videos)))

    # Get the similarity against the last task
    old_sim = old_sim2d = None
    old_btime_res = _data_from_last_task(label)
    if old_btime_res:
        old_btime_videos = _load_videos(old_btime_res)
        log.info("Found %s old videos" % str(len(old_btime_videos)))

        old_sim, old_sim2d = _get_similarity(
            old_btime_videos, new_btime_videos, output
        )
    else:
        log.info("Failed to find an older test task")

    # Compare recordings to their live site variant if it exists
    live_sim = live_sim2d = None
    if "live" not in jobs_json["extra_options"]:
        live_btime_res = _data_from_last_live_task(label)
        if live_btime_res:
            live_btime_videos = _load_videos(live_btime_res)
            log.info("Found %s live videos" % str(len(live_btime_videos)))

            live_sim, live_sim2d = _get_similarity(
                live_btime_videos, new_btime_videos, output, prefix="live_"
            )
        else:
            log.info("Failed to find a live site variant")

    return {
        "PlaybackSimilarity": live_sim,
        "PlaybackSimilarity2D": live_sim2d,
        "Similarity": old_sim,
        "Similarity2D": old_sim2d,
    }
|
|
@ -1,91 +0,0 @@
|
|||||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
||||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
||||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
||||||
"""
|
|
||||||
Generate labels for tasks without names, consistently.
|
|
||||||
Uses attributes from `primary-dependency`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from taskgraph.transforms.base import TransformSequence

# Transform sequence that the decorated functions below register into.
transforms = TransformSequence()

# Treeherder symbol template, filled from the dependent job's treeherder
# group symbol and symbol so the vismet task shows up in the same group.
SYMBOL = "{groupSymbol}({symbol}-vismet)"
# the test- prefix makes the task SETA-optimized.
LABEL = "test-vismet-{platform}-{label}"
|
|
||||||
|
|
||||||
|
|
||||||
@transforms.add
def make_label(config, jobs):
    """Give each job a deterministic label built from its primary
    dependency's attributes and the current task kind."""
    for job in jobs:
        dep_job = job["primary-dependency"]
        attr = dep_job.attributes.get

        if attr("locale", job.get("locale")):
            template = "{kind}-{locale}-{build_platform}/{build_type}"
        elif attr("l10n_chunk"):
            template = "{kind}-{build_platform}-{l10n_chunk}/{build_type}"
        elif config.kind.startswith(("release-eme-free", "release-partner-repack")):
            extra = job.get("extra", {})
            suffix = extra.get("repack_suffix", None) or extra.get("repack_id", None)
            template = "{kind}-{build_platform}"
            if suffix:
                template += "-{}".format(suffix.replace("/", "-"))
        else:
            template = "{kind}-{build_platform}/{build_type}"

        job["label"] = template.format(
            kind=config.kind,
            build_platform=attr("build_platform"),
            build_type=attr("build_type"),
            locale=attr("locale", job.get("locale", "")),  # Locale can be absent
            l10n_chunk=attr("l10n_chunk", ""),  # Can be empty
        )

        yield job
|
|
||||||
|
|
||||||
|
|
||||||
@transforms.add
def run_visual_metrics(config, jobs):
    """Wire each vismet job up to its browsertime dependency: fetch its
    results artifact, mirror its treeherder placement, and inherit its
    run-on settings.

    NOTE(review): jobs with no primary dependency appear to be dropped
    (never yielded) -- confirm this is intentional.
    """
    for job in jobs:
        dep_job = job.pop("primary-dependency", None)
        if dep_job is not None:
            platform = dep_job.task["extra"]["treeherder-platform"]
            job["dependencies"] = {dep_job.label: dep_job.label}

            # Add the artifact to be processed as a fetches artifact
            job["fetches"][dep_job.label] = [
                {"artifact": "browsertime-results.tgz", "extract": True}
            ]

            # vismet runs on Linux but we want to have it displayed
            # alongside the job it was triggered by to make it easier for
            # people to find it back.
            job["label"] = LABEL.format(platform=platform, label=dep_job.label)
            treeherder_info = dict(dep_job.task["extra"]["treeherder"])
            job["treeherder"]["platform"] = platform
            # e.g. render the vismet symbol inside the dependency's group.
            job["treeherder"]["symbol"] = SYMBOL.format(
                groupSymbol=treeherder_info["groupSymbol"],
                symbol=treeherder_info["symbol"],
            )

            # Store the platform name so we can use it to calculate
            # the similarity metric against other tasks
            job["worker"].setdefault("env", {})["TC_PLATFORM"] = platform

            # run-on-projects needs to be set based on the dependent task
            attributes = dict(dep_job.attributes)
            job["run-on-projects"] = attributes["run_on_projects"]

            # The run-on-tasks-for also needs to be setup here
            job["run-on-tasks-for"] = attributes.get("run_on_tasks_for", [])

            # We can't use the multi_dep transforms which remove this
            # field, so we remove the dependent-tasks entry here
            del job["dependent-tasks"]

            yield job
|
|
Loading…
Reference in New Issue