From 0e1e9645b05b45a232fb432a728cb3da6a4648d2 Mon Sep 17 00:00:00 2001 From: gmierz Date: Tue, 21 Jul 2020 11:34:48 -0400 Subject: [PATCH] Update visual-metric code. --- .../visual-metrics/run-visual-metrics.py | 69 +++-- .../docker/visual-metrics/similarity.py | 291 ++++++++++++------ 2 files changed, 237 insertions(+), 123 deletions(-) diff --git a/taskcluster/docker/visual-metrics/run-visual-metrics.py b/taskcluster/docker/visual-metrics/run-visual-metrics.py index 4ae05172d..14b15221f 100644 --- a/taskcluster/docker/visual-metrics/run-visual-metrics.py +++ b/taskcluster/docker/visual-metrics/run-visual-metrics.py @@ -27,12 +27,16 @@ from voluptuous import ALLOW_EXTRA, Required, Schema #: The directory where artifacts from this job will be placed. OUTPUT_DIR = Path("/", "builds", "worker", "artifacts") + #: A job to process through visualmetrics.py @attr.s class Job: #: The name of the test. test_name = attr.ib(type=str) + #: The extra options for this job. + extra_options = attr.ib(type=str) + #: json_path: The path to the ``browsertime.json`` file on disk. json_path = attr.ib(type=Path) @@ -44,7 +48,11 @@ class Job: JOB_SCHEMA = Schema( { Required("jobs"): [ - {Required("test_name"): str, Required("browsertime_json_path"): str} + { + Required("test_name"): str, + Required("browsertime_json_path"): str, + Required("extra_options"): [str], + } ], Required("application"): {Required("name"): str, "version": str}, Required("extra_options"): [str], @@ -80,7 +88,7 @@ def run_command(log, cmd): return e.returncode, e.output -def append_result(log, suites, test_name, name, result): +def append_result(log, suites, test_name, name, result, extra_options): """Appends a ``name`` metrics result in the ``test_name`` suite. Args: @@ -98,10 +106,16 @@ def append_result(log, suites, test_name, name, result): log.error("Could not convert value", name=name) log.error("%s" % result) result = 0 - if test_name not in suites: - suites[test_name] = {"name": test_name, "subtests": {}} - subtests = suites[test_name]["subtests"] + if test_name in suites and suites[test_name]["extraOptions"] != extra_options: + missing = set(extra_options) - set(suites[test_name]["extraOptions"]) + test_name = test_name + "-".join(list(missing)) + + subtests = suites.setdefault( + test_name, + {"name": test_name, "subtests": {}, "extraOptions": extra_options} + )["subtests"] + if name not in subtests: subtests[name] = { "name": name, @@ -241,6 +255,8 @@ def main(log, args): jobs.append( Job( test_name=job["test_name"], + extra_options=len(job["extra_options"]) > 0 and + job["extra_options"] or jobs_json["extra_options"], json_path=browsertime_json_path, video_path=browsertime_json_path.parent / video, ) @@ -273,45 +289,34 @@ def main(log, args): # Python 3.5 requires a str object (not 3.6+) res = json.loads(res.decode("utf8")) for name, value in res.items(): - append_result(log, suites, job.test_name, name, value) + append_result(log, suites, job.test_name, name, value, job.extra_options) suites = [get_suite(suite) for suite in suites.values()] perf_data = { "framework": {"name": "browsertime"}, "application": jobs_json["application"], - "type": "vismet", + "type": "pageload", "suites": suites, } - for entry in suites: - entry["extraOptions"] = jobs_json["extra_options"] # Try to get the similarity for all possible tests, this means that we # will also get a comparison of recorded vs. live sites to check # the on-going quality of our recordings. 
- similarity = None - if "android" in os.getenv("TC_PLATFORM", ""): - try: - from similarity import calculate_similarity - similarity = calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR, log) - except Exception: - log.info("Failed to calculate similarity score", exc_info=True) - - if similarity: - suites[0]["subtests"].append({ - "name": "Similarity3D", - "value": similarity[0], - "replicates": [similarity[0]], - "lowerIsBetter": False, - "unit": "a.u.", - }) - suites[0]["subtests"].append({ - "name": "Similarity2D", - "value": similarity[1], - "replicates": [similarity[1]], - "lowerIsBetter": False, - "unit": "a.u.", - }) + try: + from similarity import calculate_similarity + for name, value in calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR).items(): + if value is None: + continue + suites[0]["subtests"].append({ + "name": name, + "value": value, + "replicates": [value], + "lowerIsBetter": False, + "unit": "a.u.", + }) + except Exception: + log.info("Failed to calculate similarity score", exc_info=True) # Validates the perf data complies with perfherder schema. # The perfherder schema uses jsonschema so we can't use voluptuous here. diff --git a/taskcluster/docker/visual-metrics/similarity.py b/taskcluster/docker/visual-metrics/similarity.py index 5820e531e..f56e15875 100644 --- a/taskcluster/docker/visual-metrics/similarity.py +++ b/taskcluster/docker/visual-metrics/similarity.py @@ -10,6 +10,7 @@ import os import pathlib import shutil import socket +import structlog import tarfile import tempfile import urllib @@ -19,8 +20,24 @@ from matplotlib import pyplot as plt from scipy.stats import spearmanr -def open_data(file): - return cv2.VideoCapture(str(file)) +log = None + + +# We add the `and` conditions to it later +base_ad_query = { + "from": "task", + "limit": 1000, + "where": { + "and": [] + }, + "select": [ + "action.start_time", + "run.name", + "task.artifacts", + "task.group.id", + "task.id" + ], +} def socket_timeout(value=120): @@ -38,8 +55,12 @@ def socket_timeout(value=120): return _socket_timeout +def _open_data(file): + return cv2.VideoCapture(str(file)) + + @socket_timeout(120) -def query_activedata(query_json, log): +def _query_activedata(query_json): """Used to run queries on active data.""" active_data_url = "http://activedata.allizom.org/query" @@ -59,7 +80,7 @@ def query_activedata(query_json, log): @socket_timeout(120) -def download(url, loc, log): +def _download(url, loc): """Downloads from a url (with a timeout).""" log.info("Downloading %s" % url) try: @@ -70,7 +91,7 @@ def download(url, loc, log): return True -def get_frames(video): +def _get_frames(video): """Gets all frames from a video into a list.""" allframes = [] while video.isOpened(): @@ -84,77 +105,11 @@ def get_frames(video): return allframes -def calculate_similarity(jobs_json, fetch_dir, output, log): - """Calculates the similarity score against the last live site test. - - The technique works as follows: - 1. Get the last live site test. - 2. For each 15x15 video pairings, build a cross-correlation matrix: - 1. Get each of the videos and calculate their histograms - across the full videos. - 2. Calculate the correlation coefficient between these two. - 3. Average the cross-correlation matrix to obtain the score. - - The 2D similarity score is the same, except that it builds a histogram - from the final frame instead of the full video. - - For finding the last live site, we use active-data. 
We search for - PGO android builds since this metric is only available for live sites that - run on android in mozilla-cental. Given that live sites currently - run on cron 3 days a week, then it's also reasonable to look for tasks - which have occurred before today and within the last two weeks at most. - But this is a TODO for future work, since we need to determine a better - way of selecting the last task (HG push logs?) - there's a lot that factors - into these choices, so it might require a multi-faceted approach. - - Args: - jobs_json: The jobs JSON that holds extra information. - fetch_dir: The fetch directory that holds the new videos. - log: The logger. - Returns: - Two similarity scores (3D, 2D) as a float, or None if there was an issue. - """ - app = jobs_json["application"]["name"] - test = jobs_json["jobs"][0]["test_name"] - splittest = test.split("-cold") - - cold = "" - if len(splittest) > 0: - cold = ".*cold" - test = splittest[0] - - # PGO vs. OPT shouldn't matter much, but we restrict it to PGO builds here - # for android, and desktop tests have the opt/pgo restriction removed - plat = os.getenv("TC_PLATFORM", "") - if "android" in plat: - plat = plat.replace("/opt", "/pgo") - else: - plat = plat.replace("/opt", "").replace("/pgo", "") - ad_query = { - "from": "task", - "limit": 1000, - "where": { - "and": [ - { - "regexp": { - "run.name": ".*%s.*browsertime.*-live.*%s%s.*%s.*" - % (plat, app, cold, test) - } - }, - {"not": {"prefix": {"run.name": "test-vismet"}}}, - {"in": {"repo.branch.name": ["mozilla-central"]}}, - {"gte": {"action.start_time": {"date": "today-week-week"}}}, - {"lt": {"action.start_time": {"date": "today-1day"}}}, - {"in": {"task.run.state": ["completed"]}}, - ] - }, - "select": ["action.start_time", "run.name", "task.artifacts"], - } - - # Run the AD query and find the browsertime videos to download +def _get_browsertime_results(query): + """Used to run an AD query and extract the browsertime results if they exist.""" failed = False try: - data = query_activedata(ad_query, log) + data = _query_activedata(query) except Exception as e: log.info(str(e)) failed = True @@ -162,6 +117,7 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): log.info("Couldn't get activedata data") return None + # Find the newest browsertime task log.info("Found %s datums" % str(len(data["action.start_time"]))) maxind = np.argmax([float(t) for t in data["action.start_time"]]) artifacts = data["task.artifacts"][maxind] @@ -171,13 +127,20 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): btime_artifact = art["url"] break if not btime_artifact: - log.info("Can't find an older live site") + log.info("Can't find an older site test") return None + log.info("Comparing videos to TASK_GROUP=%s, TASK_ID=%s" % ( + data["task.group.id"][maxind], data["task.id"][maxind] + )) + # Download the browsertime videos and untar them tmpdir = tempfile.mkdtemp() loc = os.path.join(tmpdir, "tmpfile.tgz") - if not download(btime_artifact, loc, log): + if not _download(btime_artifact, loc): + log.info( + "Failed to download browsertime-results artifact from %s" % btime_artifact + ) return None tmploc = tempfile.mkdtemp() try: @@ -191,22 +154,90 @@ def calculate_similarity(jobs_json, fetch_dir, output, log): ) return None - # Find all the videos - oldmp4s = [str(f) for f in pathlib.Path(tmploc).rglob("*.mp4")] - log.info("Found %s old videos" % str(len(oldmp4s))) - newmp4s = [str(f) for f in pathlib.Path(fetch_dir).rglob("*.mp4")] - log.info("Found %s new videos" % 
str(len(newmp4s)))
+    return tmploc
 
-    # Finally, calculate the 2D/3D score
+
+def _data_from_last_task(label):
+    """Gets the data from the last PGO/OPT task with the same label.
+
+    We look for both OPT and PGO tasks. The difference
+    between them should be minimal. This method also provides
+    a way to compare recordings from this task to another
+    known task based on the TC_GROUP_ID environment variable.
+    """
+    label_opt = label.replace("/pgo", "/opt")
+    label_pgo = label.replace("/opt", "/pgo")
+
+    base_ad_query["where"]["and"] = [
+        {"in": {"task.run.state": ["completed"]}},
+        {"or": [
+            {"eq": {"run.name": label_pgo}},
+            {"eq": {"run.name": label_opt}}
+        ]}
+    ]
+
+    task_group_id = os.getenv("TC_GROUP_ID", "")
+    if task_group_id:
+        base_ad_query["where"]["and"].append(
+            {"eq": {"task.group.id": task_group_id}}
+        )
+    else:
+        base_ad_query["where"]["and"].extend([
+            {"in": {"repo.branch.name": ["mozilla-central"]}},
+            {"gte": {"action.start_time": {"date": "today-week-week"}}},
+        ])
+
+    return _get_browsertime_results(base_ad_query)
+
+
+def _data_from_last_live_task(label):
+    """Gets the data from the last live site PGO task."""
+    label_live = label.replace("/opt", "/pgo").replace("tp6m", "tp6m-live")
+
+    base_ad_query["where"]["and"] = [
+        {"in": {"repo.branch.name": ["mozilla-central"]}},
+        {"gte": {"action.start_time": {"date": "today-week-week"}}},
+        {"in": {"task.run.state": ["completed"]}},
+        {"eq": {"run.name": label_live}},
+    ]
+
+    return _get_browsertime_results(base_ad_query)
+
+
+def _get_similarity(old_videos_info, new_videos_info, output, prefix=""):
+    """Calculates a similarity score for two groupings of videos.
+
+    The technique works as follows:
+    1. Take the old and new videos to compare.
+    2. For each of the 15x15 video pairings, build a cross-correlation matrix:
+       1. Get each of the videos and calculate their histograms
+          across the full videos.
+       2. Calculate the correlation coefficient between these two.
+    3. Average the cross-correlation matrix to obtain the score.
+
+    The 2D similarity score is the same, except that it builds a histogram
+    from the final frame instead of the full video.
+
+    Args:
+        old_videos_info: List of old videos (cv2 data and path for each).
+        new_videos_info: List of new videos from this task (cv2 data and path for each).
+        output: Location to output videos with low similarity scores.
+        prefix: A string prefixed to the output video file names.
+    Returns:
+        Two similarity scores (3D, 2D) as floats.
+    """
     nhists = []
     nhists2d = []
 
-    total_vids = min(len(oldmp4s), len(newmp4s))
+    old_videos = [entry["data"] for entry in old_videos_info]
+    new_videos = [entry["data"] for entry in new_videos_info]
+
+    total_vids = min(len(old_videos), len(new_videos))
     xcorr = np.zeros((total_vids, total_vids))
     xcorr2d = np.zeros((total_vids, total_vids))
 
     for i in range(total_vids):
-        datao = np.asarray(get_frames(open_data(oldmp4s[i])))
+        datao = np.asarray(_get_frames(old_videos[i]))
 
         histo, _, _ = plt.hist(datao.flatten(), bins=255)
         histo2d, _, _ = plt.hist(datao[-1, :, :].flatten(), bins=255)
 
@@ -214,7 +245,7 @@ def calculate_similarity(jobs_json, fetch_dir, output, log):
         for j in range(total_vids):
             if i == 0:
                 # Only calculate the histograms once; it takes time
-                datan = np.asarray(get_frames(open_data(newmp4s[j])))
+                datan = np.asarray(_get_frames(new_videos[j]))
 
                 histn, _, _ = plt.hist(datan.flatten(), bins=255)
                 histn2d, _, _ = plt.hist(datan[-1, :, :].flatten(), bins=255)
@@ -237,15 +268,93 @@ def calculate_similarity(jobs_json, fetch_dir, output, log):
     log.info("Average 3D similarity: %s" % str(np.round(similarity, 5)))
     log.info("Average 2D similarity: %s" % str(np.round(similarity2d, 5)))
 
-    if similarity < 0.5:
-        # For really low correlations, output the worst video pairing
+    if np.round(similarity, 1) <= 0.7 or np.round(similarity2d, 1) <= 0.7:
+        # For low correlations, output the worst video pairing
         # so that we can visually see what the issue was
         minind = np.unravel_index(np.argmin(xcorr, axis=None), xcorr.shape)
 
-        oldvid = oldmp4s[minind[0]]
-        shutil.copyfile(oldvid, str(pathlib.Path(output, "old_video.mp4")))
+        oldvid = old_videos_info[minind[0]]["path"]
+        shutil.copyfile(oldvid, str(pathlib.Path(output, "%sold_video.mp4" % prefix)))
 
-        newvid = newmp4s[minind[1]]
-        shutil.copyfile(newvid, str(pathlib.Path(output, "new_video.mp4")))
+        newvid = new_videos_info[minind[1]]["path"]
+        shutil.copyfile(newvid, str(pathlib.Path(output, "%snew_video.mp4" % prefix)))
 
     return np.round(similarity, 5), np.round(similarity2d, 5)
+
+
+def calculate_similarity(jobs_json, fetch_dir, output):
+    """Calculates the similarity score for this task.
+
+    Here we use activedata to find the last live site that ran and
+    to find the last task (with the same label) that ran. Those two
+    tasks are then compared to the current one and 4 metrics are produced.
+
+    For live sites, we only calculate 2 of these metrics, since the
+    playback similarity is not applicable to them.
+
+    Args:
+        jobs_json: The jobs JSON that holds extra information.
+        fetch_dir: The fetch directory that holds the new videos.
+        output: The output directory.
+    Returns:
+        A dictionary containing up to 4 different metrics (their values default
+        to None if a metric couldn't be calculated):
+            PlaybackSimilarity: Similarity of the full playback to a live site test.
+            PlaybackSimilarity2D: - // - (but for the final frame only)
+            Similarity: Similarity of the test's video recording to its last run.
+ Similarity2D: - // - (but for the final frame only) + """ + global log + log = structlog.get_logger() + + label = os.getenv("TC_LABEL", "") + if not label: + log.info("TC_LABEL is undefined, cannot calculate similarity metrics") + return {} + + # Get all the newest videos from this task + new_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(fetch_dir).rglob("*.mp4") + ] + log.info("Found %s new videos" % str(len(new_btime_videos))) + + # Get the similarity against the last task + old_btime_res = _data_from_last_task(label) + old_sim = old_sim2d = None + if old_btime_res: + old_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(old_btime_res).rglob("*.mp4") + ] + log.info("Found %s old videos" % str(len(old_btime_videos))) + + old_sim, old_sim2d = _get_similarity( + old_btime_videos, new_btime_videos, output + ) + else: + log.info("Failed to find an older test task") + + # Compare recordings to their live site variant if it exists + live_sim = live_sim2d = None + if "live" not in jobs_json["extra_options"]: + live_btime_res = _data_from_last_live_task(label) + if live_btime_res: + live_btime_videos = [ + {"data": _open_data(str(f)), "path": str(f)} + for f in pathlib.Path(live_btime_res).rglob("*.mp4") + ] + log.info("Found %s live videos" % str(len(live_btime_videos))) + + live_sim, live_sim2d = _get_similarity( + live_btime_videos, new_btime_videos, output, prefix="live_" + ) + else: + log.info("Failed to find a live site variant") + + return { + "PlaybackSimilarity": live_sim, + "PlaybackSimilarity2D": live_sim2d, + "Similarity": old_sim, + "Similarity2D": old_sim2d, + }
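
A rough standalone sketch of the suite keying added to append_result() in run-visual-metrics.py: when a second job reuses an existing test name but carries different extra options, the options missing from the existing suite are appended to the suite key so that the two variants are reported as separate suites. The test name and option strings below are hypothetical examples, not values taken from this patch, and sorted() is only used to keep the example deterministic:

# Sketch of the suite-keying logic from append_result(); names and option
# strings are hypothetical examples.
suites = {}

def suite_for(test_name, extra_options):
    # An existing suite with the same name but different extra options gets
    # the missing options appended to its key, mirroring the patch.
    if test_name in suites and suites[test_name]["extraOptions"] != extra_options:
        missing = set(extra_options) - set(suites[test_name]["extraOptions"])
        test_name = test_name + "-".join(sorted(missing))
    return suites.setdefault(
        test_name,
        {"name": test_name, "subtests": {}, "extraOptions": extra_options},
    )

suite_for("example-pageload-test", ["cold"])
suite_for("example-pageload-test", ["cold", "live"])
print(sorted(suites))  # ['example-pageload-test', 'example-pageload-testlive']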
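The similarity metrics documented in _get_similarity() come down to comparing pixel-value histograms of two groups of videos with a Spearman correlation and averaging the resulting cross-correlation matrix. The sketch below is a minimal illustration of that idea under stated assumptions: synthetic arrays stand in for cv2-decoded videos, and numpy.histogram replaces the matplotlib plt.hist call used in similarity.py; all names and data are illustrative only.

# Minimal sketch of the 3D/2D similarity scores described in _get_similarity().
# Synthetic arrays stand in for decoded videos (frames x height x width).
import numpy as np
from scipy.stats import spearmanr

def histograms(frames):
    # Histogram over every pixel of the whole video (the 3D score input) and
    # over the final frame only (the 2D score input), 255 bins as in the patch.
    full, _ = np.histogram(frames.flatten(), bins=255)
    last, _ = np.histogram(frames[-1].flatten(), bins=255)
    return full, last

rng = np.random.default_rng(0)
old_videos = [rng.integers(0, 256, size=(30, 48, 64)) for _ in range(3)]
new_videos = [rng.integers(0, 256, size=(30, 48, 64)) for _ in range(3)]

total = min(len(old_videos), len(new_videos))
xcorr = np.zeros((total, total))
xcorr2d = np.zeros((total, total))
for i in range(total):
    histo, histo2d = histograms(old_videos[i])
    for j in range(total):
        histn, histn2d = histograms(new_videos[j])
        # Spearman rank correlation between the two histograms.
        xcorr[i, j] = spearmanr(histo, histn)[0]
        xcorr2d[i, j] = spearmanr(histo2d, histn2d)[0]

print("Similarity (3D):", np.round(np.mean(xcorr), 5))
print("Similarity (2D):", np.round(np.mean(xcorr2d), 5))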