From 359f27abc17619a7a96362e1fb4de736ac6501d7 Mon Sep 17 00:00:00 2001 From: Michael Comella Date: Wed, 17 Feb 2021 20:13:45 -0800 Subject: [PATCH] For #17972: split frameworkStart into two metrics. We do this in order to make it easier to analyze in GLAM: see the metric descriptions for more details. Additionally, we change the time unit to milliseconds to make it easier to analyze in GLAM. --- app/metrics.yaml | 40 +++++++-- app/pings.yaml | 1 + .../perf/StartupFrameworkStartMeasurement.kt | 20 +++-- .../StartupFrameworkStartMeasurementTest.kt | 87 ++++++++++++++++--- docs/metrics.md | 18 ++-- 5 files changed, 136 insertions(+), 30 deletions(-) diff --git a/app/metrics.yaml b/app/metrics.yaml index ee9289f5d..53e09d98e 100644 --- a/app/metrics.yaml +++ b/app/metrics.yaml @@ -3878,14 +3878,21 @@ addons: expires: "2021-04-01" startup.timeline: - framework_start: + framework_primary: send_in_pings: - startup-timeline type: timespan - time_unit: nanosecond + time_unit: millisecond description: | The duration the Android framework takes to start before letting us run - code in `*Application.init`. This is calculated from `appInitTimestamp - + code in `*Application.init` when this device has `clock_ticks_per_second` + equal to 100: if it's not equal to 100, then this value is captured in + `framework_secondary`. We split this into two metrics to make it easier + to analyze in GLAM. We split on 100 because when we did our initial brief + analysis - https://sql.telemetry.mozilla.org/queries/75591 - the results + for clocks ticks were overwhelmingly 100. + + The duration is calculated from `appInitTimestamp - processStartTimestamp`. `processStartTimestamp` is derived from the clock tick time unit, which is expected to be less granular than nanoseconds. Therefore, we convert and round our timestamps to clock ticks before @@ -3895,9 +3902,30 @@ startup.timeline: devices, is also reported as a metric bugs: - https://github.com/mozilla-mobile/fenix/issues/8803 + - https://github.com/mozilla-mobile/fenix/issues/17972 data_reviews: - https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626 - https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068 + - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284 + data_sensitivity: + - technical + notification_emails: + - perf-android-fe@mozilla.com + - mcomella@mozilla.com + expires: "2021-08-01" + framework_secondary: + send_in_pings: + - startup-timeline + type: timespan + time_unit: millisecond + description: | + The duration the Android framework takes to start before letting us run + code in `*Application.init` when this device has `clock_ticks_per_second` + not equal to 100. For more details on this metric, see `framework_primary` + bugs: + - https://github.com/mozilla-mobile/fenix/issues/17972 + data_reviews: + - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284 data_sensitivity: - technical notification_emails: @@ -3909,9 +3937,9 @@ startup.timeline: - startup-timeline type: boolean description: | - An error when attempting to record `framework_start` - the application - init timestamp returned a negative value - which is likely indicative of a - bug in the implementation. + An error when attempting to record `framework_primary/secondary` - the + application init timestamp returned a negative value - which is likely + indicative of a bug in the implementation. bugs: - https://github.com/mozilla-mobile/fenix/issues/8803 data_reviews: diff --git a/app/pings.yaml b/app/pings.yaml index 1a8502827..bd726ff12 100644 --- a/app/pings.yaml +++ b/app/pings.yaml @@ -47,6 +47,7 @@ startup-timeline: - https://github.com/mozilla-mobile/fenix/issues/17972 data_reviews: - https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626 + - https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284 notification_emails: - perf-android-fe@mozilla.com - mcomella@mozilla.com diff --git a/app/src/main/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurement.kt b/app/src/main/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurement.kt index dbc4d2c24..47953d9b7 100644 --- a/app/src/main/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurement.kt +++ b/app/src/main/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurement.kt @@ -54,20 +54,30 @@ internal class StartupFrameworkStartMeasurement( if (applicationInitNanos < 0) { telemetry.frameworkStartError.set(true) } else { + val clockTicksPerSecond = stat.clockTicksPerSecond.also { + // framework* is derived from the number of clock ticks per second. To ensure this + // value does not throw off our result, we capture it too. + telemetry.clockTicksPerSecond.add(it.toInt()) + } + + // In our brief analysis, clock ticks per second was overwhelmingly equal to 100. To make + // analysis easier in GLAM, we split the results into two separate metrics. See the + // metric descriptions for more details. + @Suppress("MagicNumber") // it's more confusing to separate the comment above from the value declaration. + val durationMetric = + if (clockTicksPerSecond == 100L) telemetry.frameworkPrimary else telemetry.frameworkSecondary + try { - telemetry.frameworkStart.setRawNanos(getFrameworkStartNanos()) + durationMetric.setRawNanos(getFrameworkStartNanos()) } catch (e: FileNotFoundException) { // Privacy managers can add hooks that block access to reading system /proc files. // We want to catch these exception and report an error on accessing the file // rather than an implementation error. telemetry.frameworkStartReadError.set(true) } - - // frameworkStart is derived from the number of clock ticks per second. To ensure this - // value does not throw off our result, we capture it too. - telemetry.clockTicksPerSecond.add(stat.clockTicksPerSecond.toInt()) } } + /** * @throws [java.io.FileNotFoundException] */ diff --git a/app/src/test/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurementTest.kt b/app/src/test/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurementTest.kt index 7fadf1f28..fa465b2fb 100644 --- a/app/src/test/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurementTest.kt +++ b/app/src/test/java/org/mozilla/fenix/perf/StartupFrameworkStartMeasurementTest.kt @@ -17,7 +17,8 @@ import org.junit.Before import org.junit.Test import org.mozilla.fenix.GleanMetrics.StartupTimeline as Telemetry -private const val CLOCK_TICKS_PER_SECOND = 100L +private const val PRIMARY_TICKS = 100L +private const val SECONDARY_TICKS = 50L class StartupFrameworkStartMeasurementTest { @@ -27,10 +28,13 @@ class StartupFrameworkStartMeasurementTest { // We'd prefer to use the Glean test methods over these mocks but they require us to add // Robolectric and it's not worth the impact on test duration. @MockK private lateinit var telemetry: Telemetry - @MockK(relaxed = true) private lateinit var frameworkStart: TimespanMetricType + @MockK(relaxed = true) private lateinit var frameworkPrimary: TimespanMetricType + @MockK(relaxed = true) private lateinit var frameworkSecondary: TimespanMetricType @MockK(relaxed = true) private lateinit var frameworkStartError: BooleanMetricType @MockK(relaxed = true) private lateinit var clockTicksPerSecond: CounterMetricType + private var clockTicksPerSecondValue = -1L + private var elapsedRealtimeNanos = -1L private var processStartTimeTicks = -1L @@ -41,13 +45,17 @@ class StartupFrameworkStartMeasurementTest { elapsedRealtimeNanos = -1 processStartTimeTicks = -1 + // This value is hard-coded in the OS so we default to it as it's the expected value. + clockTicksPerSecondValue = PRIMARY_TICKS + stat = spyk(object : Stat() { - override val clockTicksPerSecond: Long get() = CLOCK_TICKS_PER_SECOND + override val clockTicksPerSecond: Long get() = clockTicksPerSecondValue }) every { stat.getProcessStartTimeTicks(any()) } answers { processStartTimeTicks } val getElapsedRealtimeNanos = { elapsedRealtimeNanos } - every { telemetry.frameworkStart } returns frameworkStart + every { telemetry.frameworkPrimary } returns frameworkPrimary + every { telemetry.frameworkSecondary } returns frameworkSecondary every { telemetry.frameworkStartError } returns frameworkStartError every { telemetry.clockTicksPerSecond } returns clockTicksPerSecond @@ -67,13 +75,32 @@ class StartupFrameworkStartMeasurementTest { } @Test - fun `GIVEN app init is set to valid values WHEN metrics are set THEN frameworkStart is set with the correct value`() { + fun `GIVEN app init is set to valid values and clock ticks per second is 100 WHEN metrics are set THEN frameworkPrimary is set with the correct value`() { + clockTicksPerSecondValue = PRIMARY_TICKS + setProcessAppInitAndMetrics(processStart = 166_636_813, appInit = 1_845_312_345_673_925) + verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary = true) // calculated by hand. + } + + @Test + fun `GIVEN app init is set to valid values and clock ticks per second is not 100 WHEN metrics are set THEN frameworkSecondary is set with the correct value`() { + clockTicksPerSecondValue = SECONDARY_TICKS setProcessAppInitAndMetrics(processStart = 166_636_813, appInit = 1_845_312_345_673_925) - verifyFrameworkStartSuccess(178_944_220_000_000) // calculated by hand. + verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary = false) // calculated by hand. } @Test // this overlaps with the success case test. - fun `GIVEN app init has valid values WHEN onAppInit is called twice and metrics are set THEN frameworkStart uses the first app init value`() { + fun `GIVEN app init has valid values and clock ticks per second is 100 WHEN onAppInit is called twice and metrics are set THEN frameworkPrimary uses the first app init value`() { + clockTicksPerSecondValue = PRIMARY_TICKS + testAppInitCalledTwice(isPrimary = true) + } + + @Test // this overlaps with the success case test. + fun `GIVEN app init has valid values and clock ticks per second is not 100 WHEN onAppInit is called twice and metrics are set THEN frameworkSecondary uses the first app init value`() { + clockTicksPerSecondValue = SECONDARY_TICKS + testAppInitCalledTwice(isPrimary = false) + } + + private fun testAppInitCalledTwice(isPrimary: Boolean) { processStartTimeTicks = 166_636_813 elapsedRealtimeNanos = 1_845_312_345_673_925 @@ -82,14 +109,28 @@ class StartupFrameworkStartMeasurementTest { metrics.onApplicationInit() metrics.setExpensiveMetric() - verifyFrameworkStartSuccess(178_944_220_000_000) // calculated by hand. + verifyFrameworkStartSuccess(178_944_220_000_000, isPrimary) // calculated by hand. } @Test - fun `GIVEN app init have valid values WHEN metrics are set twice THEN frameworkStart is only set once`() { + fun `GIVEN app init have valid values and clock ticks per second is 100 WHEN metrics are set twice THEN frameworkPrimary is only set once`() { + clockTicksPerSecondValue = PRIMARY_TICKS + testMetricsSetTwice(isPrimary = true) + } + + @Test + fun `GIVEN app init have valid values and clock ticks per second is not 100 WHEN metrics are set twice THEN frameworkSecondary is only set once`() { + clockTicksPerSecondValue = SECONDARY_TICKS + testMetricsSetTwice(isPrimary = false) + } + + private fun testMetricsSetTwice(isPrimary: Boolean) { setProcessAppInitAndMetrics(10, 100) metrics.setExpensiveMetric() - verify(exactly = 1) { frameworkStart.setRawNanos(any()) } + + val (setMetric, unsetMetric) = getSetAndUnsetMetric(isPrimary) + verify(exactly = 1) { setMetric.setRawNanos(any()) } + verify { unsetMetric wasNot Called } verify(exactly = 1) { clockTicksPerSecond.add(any()) } verify { frameworkStartError wasNot Called } } @@ -103,15 +144,33 @@ class StartupFrameworkStartMeasurementTest { metrics.setExpensiveMetric() } - private fun verifyFrameworkStartSuccess(nanos: Long) { - verify { frameworkStart.setRawNanos(nanos) } - verify { clockTicksPerSecond.add(CLOCK_TICKS_PER_SECOND.toInt()) } + private fun verifyFrameworkStartSuccess(nanos: Long, isPrimary: Boolean) { + val (setMetric, unsetMetric) = getSetAndUnsetMetric(isPrimary) + verify { setMetric.setRawNanos(nanos) } + verify { unsetMetric wasNot Called } + + val expectedClockTicksPerSecond = getExpectedClockTicksPerSecond(isPrimary) + verify { clockTicksPerSecond.add(expectedClockTicksPerSecond.toInt()) } verify { frameworkStartError wasNot Called } } private fun verifyFrameworkStartError() { verify { frameworkStartError.set(true) } - verify { frameworkStart wasNot Called } + verify { frameworkPrimary wasNot Called } + verify { frameworkSecondary wasNot Called } verify { clockTicksPerSecond wasNot Called } } + + private fun getSetAndUnsetMetric(isPrimary: Boolean): Pair { + return if (isPrimary) { + Pair(frameworkPrimary, frameworkSecondary) + } else { + Pair(frameworkSecondary, frameworkPrimary) + } + } + + // This hard-codes some data that's passed into the test but I don't want to spend more time + // so I don't bother cleaning it up now. + private fun getExpectedClockTicksPerSecond(isPrimary: Boolean): Long = + if (isPrimary) PRIMARY_TICKS else SECONDARY_TICKS } diff --git a/docs/metrics.md b/docs/metrics.md index a3026ee72..4412b58ab 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -346,26 +346,34 @@ The following metrics are added to the ping: This ping is intended to provide an understanding of startup performance. -The ping is intended to be captured by performance testing automation to -report results there, in addition to user telemetry. We place these metrics -into their own ping in order to isolate them and make this process easier. +In addition to being captured on real devices, the ping data was prematurely +optimized into this separate ping to be isolated from other metrics to be +more easily captured by performance testing automation but that hasn't +happened in practice. We would have removed it but implementation +details don't make that possible: +https://github.com/mozilla-mobile/fenix/issues/17972#issuecomment-781002987 +This ping includes the [client id](https://mozilla.github.io/glean/book/user/pings/index.html#the-client_info-section). + **Data reviews for this ping:** - +- **Bugs related to this ping:** - +- The following metrics are added to the ping: | Name | Type | Description | Data reviews | Extras | Expiration | [Data Sensitivity](https://wiki.mozilla.org/Firefox/Data_Collection) | | --- | --- | --- | --- | --- | --- | --- | | startup.timeline.clock_ticks_per_second |[counter](https://mozilla.github.io/glean/book/user/metrics/counter.html) |The number of clock tick time units that occur in one second on this particular device. This value is expected to be used in conjunction with the `framework_start` metric. |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 | -| startup.timeline.framework_start |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init`. This is calculated from `appInitTimestamp - processStartTimestamp`. `processStartTimestamp` is derived from the clock tick time unit, which is expected to be less granular than nanoseconds. Therefore, we convert and round our timestamps to clock ticks before computing the difference and convert back to nanoseconds to report. For debugging purposes, `clock_ticks_per_second`, which may vary between devices, is also reported as a metric |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 | -| startup.timeline.framework_start_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to record `framework_start` - the application init timestamp returned a negative value - which is likely indicative of a bug in the implementation. |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 | +| startup.timeline.framework_primary |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init` when this device has `clock_ticks_per_second` equal to 100: if it's not equal to 100, then this value is captured in `framework_secondary`. We split this into two metrics to make it easier to analyze in GLAM. We split on 100 because when we did our initial brief analysis - https://sql.telemetry.mozilla.org/queries/75591 - the results for clocks ticks were overwhelmingly 100. The duration is calculated from `appInitTimestamp - processStartTimestamp`. `processStartTimestamp` is derived from the clock tick time unit, which is expected to be less granular than nanoseconds. Therefore, we convert and round our timestamps to clock ticks before computing the difference and convert back to nanoseconds to report. For debugging purposes, `clock_ticks_per_second`, which may vary between devices, is also reported as a metric |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068), [3](https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284)||2021-08-01 |1 | +| startup.timeline.framework_secondary |[timespan](https://mozilla.github.io/glean/book/user/metrics/timespan.html) |The duration the Android framework takes to start before letting us run code in `*Application.init` when this device has `clock_ticks_per_second` not equal to 100. For more details on this metric, see `framework_primary` |[1](https://github.com/mozilla-mobile/fenix/pull/18043#issue-575389284)||2021-08-01 |1 | +| startup.timeline.framework_start_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to record `framework_primary/secondary` - the application init timestamp returned a negative value - which is likely indicative of a bug in the implementation. |[1](https://github.com/mozilla-mobile/fenix/pull/9788#pullrequestreview-394228626), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 | | startup.timeline.framework_start_read_error |[boolean](https://mozilla.github.io/glean/book/user/metrics/boolean.html) |An error when attempting to read stats from /proc pseudo-filesystem - privacy managers can block access to reading these files - the application will catch a file reading exception. |[1](https://github.com/mozilla-mobile/fenix/pull/10481), [2](https://github.com/mozilla-mobile/fenix/pull/15713#issuecomment-703972068)||2021-08-01 |1 |