From 3c668ae89d03f4654d6070701bfbcfc50b8cb2cf Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Wed, 1 Jan 2020 17:31:51 -0500 Subject: [PATCH 1/9] Replaced CaptureScreenshot with a DOM Query --- main.go | 88 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/main.go b/main.go index 23a59d2..9682a3a 100644 --- a/main.go +++ b/main.go @@ -19,7 +19,6 @@ limitations under the License. package main import ( - "bytes" "context" "errors" "flag" @@ -33,7 +32,6 @@ import ( "strings" "time" - "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/input" "github.com/chromedp/cdproto/page" "github.com/chromedp/chromedp" @@ -280,54 +278,70 @@ func (s *Session) firstNav(ctx context.Context) error { return nil } -// navToEnd waits for the page to be ready to receive scroll key events, by -// trying to select an item with the right arrow key, and then scrolls down to the -// end of the page, i.e. to the oldest items. +// navToEnd waits for the page to be ready to receive scroll key events, +// and then scrolls down to the end of the page, i.e. to the oldest items. func navToEnd(ctx context.Context) error { - // wait for page to be loaded, i.e. that we can make an element active by using - // the right arrow key. - for { - chromedp.KeyEvent(kb.ArrowRight).Do(ctx) - time.Sleep(tick) - var ids []cdp.NodeID - if err := chromedp.Run(ctx, - chromedp.NodeIDs(`document.activeElement`, &ids, chromedp.ByJSPath)); err != nil { - return err - } - if len(ids) > 0 { - if *verboseFlag { - log.Printf("We are ready, because element %v is selected", ids[0]) - } - break - } - time.Sleep(tick) - } + // wait for page to be loaded + chromedp.WaitReady("body", chromedp.ByQuery).Do(ctx) - // try jumping to the end of the page. detect we are there and have stopped - // moving when two consecutive screenshots are identical. - var previousScr, scr []byte + // Try scrolling to the end of the page. 
+ // After each scroll attempt we extract the last Photo Page Element (href) from the DOM. + // We detect we are at the end when two consecutive DOM extractions are identical. + var previousHref string for { chromedp.KeyEvent(kb.PageDown).Do(ctx) chromedp.KeyEvent(kb.End).Do(ctx) - chromedp.CaptureScreenshot(&scr).Do(ctx) - if previousScr == nil { - previousScr = scr - continue + time.Sleep(tick) // sleep *before* we establish new state in DOM + + // Extract last Photo Page Element (href) from DOM. + lastHrefInDOM, err := lastPhotoInDOM(ctx) + if err != nil { + continue // Just ignore this error, continue will retry. } - if bytes.Equal(previousScr, scr) { + if previousHref == lastHrefInDOM { break } - previousScr = scr - time.Sleep(tick) + previousHref = lastHrefInDOM } - if *verboseFlag { - log.Printf("Successfully jumped to the end") + // Now that we have stopped scrolling, select (focus) on the last element + // The element must be focused, so that navToLast can send "\n" to enter photo detail page + lastEltSel := fmt.Sprintf(`a[href="%s"]`, previousHref) + if err := chromedp.Focus(lastEltSel).Do(ctx); err != nil { + log.Printf("Error focus: %s", lastEltSel) } + if *verboseFlag { + log.Printf("Successfully jumped to the end: %s", previousHref) + } return nil } +// When in the Main/Album Page, the DOM contains elements for all visible images. +// lastPhotoInDOM simply returns the last such href in document order. +// The DOM actually contains more images than those that are visible, in a kind of virtual scrolling window +// In the DOM, but not reflecting exactly the visible photos (actually a superset of the visible elements): +// +// +// +// We tried to find the actual *oldest* photo by using the aria-label attribute which contains a date for the photo, +// unfortunately that label is localised for each user's language which makes the date format very hard to parse. 
+func lastPhotoInDOM(ctx context.Context) (string, error) { + sel := `a[href^="./photo/"]` // css selector for all links to images with href prefix "./photo/..." + var attrs []map[string]string + if err := chromedp.AttributesAll(sel, &attrs).Do(ctx); err != nil { + log.Printf("lastPhotoInDOM: document.quertSelectorAll:%s error %s", sel, err) + return "", err + } + if len(attrs) == 0 { + return "", fmt.Errorf("lastPhotoInDOM: no elements match") + } + + lastElement := attrs[len(attrs)-1] + href := lastElement["href"] + return href, nil +} + // navToLast sends the "\n" event until we detect that an item is loaded as a // new page. It then sends the right arrow key event until we've reached the very // last item. @@ -377,7 +391,8 @@ func doRun(filePath string) error { // navLeft navigates to the next item to the left func navLeft(ctx context.Context) error { chromedp.KeyEvent(kb.ArrowLeft).Do(ctx) - chromedp.WaitReady("body", chromedp.ByQuery) + // Could wait for the location to change instead of this Sleep. 
+ time.Sleep(200 * time.Millisecond) return nil } @@ -552,6 +567,7 @@ func (s *Session) navN(N int) func(context.Context) error { if N > 0 && n >= N { break } + if err := navLeft(ctx); err != nil { return err } From c4e30db07209791ff3d7694a569eeec43136a702 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Fri, 3 Jan 2020 17:16:28 -0500 Subject: [PATCH 2/9] Prefer caller to log errors --- main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.go b/main.go index 9682a3a..26c640d 100644 --- a/main.go +++ b/main.go @@ -308,7 +308,7 @@ func navToEnd(ctx context.Context) error { // The element must be focused, so that navToLast can send "\n" to enter photo detail page lastEltSel := fmt.Sprintf(`a[href="%s"]`, previousHref) if err := chromedp.Focus(lastEltSel).Do(ctx); err != nil { - log.Printf("Error focus: %s", lastEltSel) + return err } if *verboseFlag { @@ -330,7 +330,6 @@ func lastPhotoInDOM(ctx context.Context) (string, error) { sel := `a[href^="./photo/"]` // css selector for all links to images with href prefix "./photo/..." var attrs []map[string]string if err := chromedp.AttributesAll(sel, &attrs).Do(ctx); err != nil { - log.Printf("lastPhotoInDOM: document.quertSelectorAll:%s error %s", sel, err) return "", err } if len(attrs) == 0 { From 59f6603fc64802f271153362892a34bc8a564233 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Thu, 9 Jan 2020 20:36:48 -0500 Subject: [PATCH 3/9] restore navLeft, removing arbitrary time.Sleep --- main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.go b/main.go index 26c640d..429ea2a 100644 --- a/main.go +++ b/main.go @@ -390,8 +390,7 @@ func doRun(filePath string) error { // navLeft navigates to the next item to the left func navLeft(ctx context.Context) error { chromedp.KeyEvent(kb.ArrowLeft).Do(ctx) - // Could wait for the location to change instead of this Sleep. 
- time.Sleep(200 * time.Millisecond) + chromedp.WaitReady("body", chromedp.ByQuery) return nil } From a7f9a45d3523e4406c00a024da7dcf6dc79c3e7c Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Sat, 11 Jan 2020 11:20:59 -0500 Subject: [PATCH 4/9] Remove all the css stuff, simplify with single loop over document.activeElement.href --- main.go | 77 ++++++++++++++++++++------------------------------------- 1 file changed, 27 insertions(+), 50 deletions(-) diff --git a/main.go b/main.go index 429ea2a..d28f64e 100644 --- a/main.go +++ b/main.go @@ -278,67 +278,44 @@ func (s *Session) firstNav(ctx context.Context) error { return nil } -// navToEnd waits for the page to be ready to receive scroll key events, -// and then scrolls down to the end of the page, i.e. to the oldest items. +// navToEnd selects the last item in the page +// by repeatedly advances the selected item with +// - kb.ArrowRight (which causes an initial selection, and/or advances it by one) +// - kb.End which scrolls to the end of the page, and advances the selected item. +// Note timing is important, because when the kb.End causes significant scrolling, +// the active element become undefined for a certain time, in that case, we get an error (ignore), sleep, and retry. +// The termnation criteria is that the selected item (document.activeElement.href) is stable for >2 iterations func navToEnd(ctx context.Context) error { - // wait for page to be loaded - chromedp.WaitReady("body", chromedp.ByQuery).Do(ctx) - - // Try scrolling to the end of the page. - // After each scroll attempt we extract the last Photo Page Element (href) from the DOM. - // We detect we are at the end when two consecutive DOM extractions are identical. 
- var previousHref string + var prev, active string + lastRepeated := 0 for { - chromedp.KeyEvent(kb.PageDown).Do(ctx) + chromedp.KeyEvent(kb.ArrowRight).Do(ctx) chromedp.KeyEvent(kb.End).Do(ctx) - time.Sleep(tick) // sleep *before* we establish new state in DOM + time.Sleep(tick) - // Extract last Photo Page Element (href) from DOM. - lastHrefInDOM, err := lastPhotoInDOM(ctx) - if err != nil { - continue // Just ignore this error, continue will retry. + if err := chromedp.Evaluate(`document.activeElement.href`, &active).Do(ctx); err != nil { + time.Sleep(tick) // this extra sleep is important: after the kb.End, it sometimes takes a while for the active element to be reset + continue // ignore this error: no active element, or active element has no href + } + if active == prev { + lastRepeated++ + } else { + lastRepeated = 0 } - if previousHref == lastHrefInDOM { + if *verboseFlag { + log.Printf("** navToEnd:activeElt %s %d", active, lastRepeated) + } + if lastRepeated > 2 { break } - previousHref = lastHrefInDOM - } - - // Now that we have stopped scrolling, select (focus) on the last element - // The element must be focused, so that navToLast can send "\n" to enter photo detail page - lastEltSel := fmt.Sprintf(`a[href="%s"]`, previousHref) - if err := chromedp.Focus(lastEltSel).Do(ctx); err != nil { - return err + prev = active + // time.Sleep(tick) } - if *verboseFlag { - log.Printf("Successfully jumped to the end: %s", previousHref) + log.Printf("Successfully jumped to the end: %s", active) } - return nil -} -// When in the Main/Album Page, the DOM contains elements for all visible images. -// lastPhotoInDOM simply returns the last such href in document order. 
-// The DOM actually contains more images than those that are visible, in a kind of virtual scrolling window -// In the DOM, but not reflecting exactly the visible photos (actually a superset of the visible elements): -// -// -// -// We tried to find the actual *oldest* photo by using the aria-label attribute which contains a date for the photo, -// unfortunately that label is localised for each user's language which makes the date format very hard to parse. -func lastPhotoInDOM(ctx context.Context) (string, error) { - sel := `a[href^="./photo/"]` // css selector for all links to images with href prefix "./photo/..." - var attrs []map[string]string - if err := chromedp.AttributesAll(sel, &attrs).Do(ctx); err != nil { - return "", err - } - if len(attrs) == 0 { - return "", fmt.Errorf("lastPhotoInDOM: no elements match") - } - - lastElement := attrs[len(attrs)-1] - href := lastElement["href"] - return href, nil + return nil } // navToLast sends the "\n" event until we detect that an item is loaded as a From 07de9454055f8bb081232ec1049b814b5a809843 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Sun, 12 Jan 2020 19:40:02 -0500 Subject: [PATCH 5/9] Reformat comments --- main.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/main.go b/main.go index d28f64e..98021c6 100644 --- a/main.go +++ b/main.go @@ -283,8 +283,10 @@ func (s *Session) firstNav(ctx context.Context) error { // - kb.ArrowRight (which causes an initial selection, and/or advances it by one) // - kb.End which scrolls to the end of the page, and advances the selected item. // Note timing is important, because when the kb.End causes significant scrolling, -// the active element become undefined for a certain time, in that case, we get an error (ignore), sleep, and retry. 
-// The termnation criteria is that the selected item (document.activeElement.href) is stable for >2 iterations +// the active element become undefined for a certain time, in that case, we +// get an error (ignore), sleep, and retry. +// The termnation criteria is that the selected item (document.activeElement.href) +// is stable for >2 iterations func navToEnd(ctx context.Context) error { var prev, active string lastRepeated := 0 @@ -294,8 +296,12 @@ func navToEnd(ctx context.Context) error { time.Sleep(tick) if err := chromedp.Evaluate(`document.activeElement.href`, &active).Do(ctx); err != nil { - time.Sleep(tick) // this extra sleep is important: after the kb.End, it sometimes takes a while for the active element to be reset - continue // ignore this error: no active element, or active element has no href + // This extra sleep is important: after the kb.End, + // it sometimes takes a while for the scrolled page to be in a state + // which allows the next kb.ArrowRight to take effect and actually select + // the next element at the new scroll position. 
+ time.Sleep(tick) + continue // ignore this error: no active element, or active element has no href } if active == prev { lastRepeated++ @@ -309,7 +315,6 @@ func navToEnd(ctx context.Context) error { break } prev = active - // time.Sleep(tick) } if *verboseFlag { log.Printf("Successfully jumped to the end: %s", active) From 0369c1fe008cf578a2bf3da4a0ff6f02d43c6261 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Sun, 12 Jan 2020 19:44:08 -0500 Subject: [PATCH 6/9] full sentences, not telegraphic style --- main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.go b/main.go index 98021c6..668f950 100644 --- a/main.go +++ b/main.go @@ -309,7 +309,7 @@ func navToEnd(ctx context.Context) error { lastRepeated = 0 } if *verboseFlag { - log.Printf("** navToEnd:activeElt %s %d", active, lastRepeated) + log.Printf("Active element %s was seen %d times", active, lastRepeated+1) } if lastRepeated > 2 { break From 2b17b21027f4cbb4a05f71505b2eddd3fa98c140 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Fri, 17 Jan 2020 12:31:56 -0500 Subject: [PATCH 7/9] Folded navToLast into navToEnd, also added check to navLeft to avoid early termination --- main.go | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/main.go b/main.go index 668f950..17df7ad 100644 --- a/main.go +++ b/main.go @@ -271,15 +271,11 @@ func (s *Session) firstNav(ctx context.Context) error { return err } - if err := navToLast(ctx); err != nil { - return err - } - return nil } // navToEnd selects the last item in the page -// by repeatedly advances the selected item with +// by repeatedly advancing the selected item with // - kb.ArrowRight (which causes an initial selection, and/or advances it by one) // - kb.End which scrolls to the end of the page, and advances the selected item. 
// Note timing is important, because when the kb.End causes significant scrolling, @@ -320,38 +316,16 @@ func navToEnd(ctx context.Context) error { log.Printf("Successfully jumped to the end: %s", active) } - return nil -} + chromedp.KeyEvent("\n").Do(ctx) + time.Sleep(tick) + var location string + if err := chromedp.Location(&location).Do(ctx); err != nil { + return err + } -// navToLast sends the "\n" event until we detect that an item is loaded as a -// new page. It then sends the right arrow key event until we've reached the very -// last item. -func navToLast(ctx context.Context) error { - var location, prevLocation string - ready := false - for { - chromedp.KeyEvent(kb.ArrowRight).Do(ctx) - time.Sleep(tick) - if !ready { - chromedp.KeyEvent("\n").Do(ctx) - time.Sleep(tick) - } - if err := chromedp.Location(&location).Do(ctx); err != nil { - return err - } - if !ready { - if location != "https://photos.google.com/" { - ready = true - log.Printf("Nav to the end sequence is started because location is %v", location) - } - continue - } + log.Printf("Entered Detail Page: %s", location) + time.Sleep(tick) - if location == prevLocation { - break - } - prevLocation = location - } return nil } @@ -370,9 +344,35 @@ func doRun(filePath string) error { } // navLeft navigates to the next item to the left +// After the navigation sequence (ArrowLeft) is sent, wait for a Location change. 
+// If however, after 100ms (maxIterations*miniTick), no change is seen, +// then return (we have reached the end) +// navLeft almost always exits after a single iteration, +// but without waiting for the location change, we often see +// a failure to navigate, especially on the first invocation, +// which causes a (false) early termination of the main navN loop func navLeft(ctx context.Context) error { + var prevLocation string + if err := chromedp.Location(&prevLocation).Do(ctx); err != nil { + return err + } + chromedp.KeyEvent(kb.ArrowLeft).Do(ctx) - chromedp.WaitReady("body", chromedp.ByQuery) + + maxIterations := 10 + miniTick := 10 * time.Millisecond + var location string + for i := 0; i < maxIterations; i++ { + + if err := chromedp.Location(&location).Do(ctx); err != nil { + return err + } + if location != prevLocation { + log.Printf("navLeft break at it:%d", i) + break + } + time.Sleep(miniTick) + } return nil } @@ -533,6 +533,9 @@ func (s *Session) navN(N int) func(context.Context) error { return err } if location == prevLocation { + if *verboseFlag { + log.Printf("Terminating because we stopped advancing: %s", prevLocation) + } break } prevLocation = location @@ -545,6 +548,9 @@ func (s *Session) navN(N int) func(context.Context) error { } n++ if N > 0 && n >= N { + if *verboseFlag { + log.Printf("Terminating because desired number of items (%d) was reached", n) + } break } From c224e2fe0155a4190d38ecc29c8046a558885d2f Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Fri, 17 Jan 2020 15:10:54 -0500 Subject: [PATCH 8/9] Move navToEnd logging line --- main.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.go b/main.go index 17df7ad..bb88e6f 100644 --- a/main.go +++ b/main.go @@ -312,9 +312,6 @@ func navToEnd(ctx context.Context) error { } prev = active } - if *verboseFlag { - log.Printf("Successfully jumped to the end: %s", active) - } chromedp.KeyEvent("\n").Do(ctx) time.Sleep(tick) @@ -323,8 +320,11 @@ func 
navToEnd(ctx context.Context) error { return err } - log.Printf("Entered Detail Page: %s", location) - time.Sleep(tick) + if active == location { + if *verboseFlag { + log.Printf("Successfully jumped to the end: %s", location) + } + } return nil } From e4a0f89df933fa1f7af13f5000a7d61ae08f2205 Mon Sep 17 00:00:00 2001 From: Daniel Lauzon Date: Mon, 20 Jan 2020 21:42:02 -0500 Subject: [PATCH 9/9] reverse navLeft modifications --- main.go | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/main.go b/main.go index bb88e6f..45a0b99 100644 --- a/main.go +++ b/main.go @@ -344,35 +344,9 @@ func doRun(filePath string) error { } // navLeft navigates to the next item to the left -// After the navigation sequence (ArrowLeft) is sent, wait for a Location change. -// If however, after 100ms (maxIterations*miniTick), no change is seen, -// then return (we have reached the end) -// navLeft almost always exits after a single iteration, -// but without waiting for the location change, we often see -// a failure to navigate, especially on the first invocation, -// which causes a (false) early termination of the main navN loop func navLeft(ctx context.Context) error { - var prevLocation string - if err := chromedp.Location(&prevLocation).Do(ctx); err != nil { - return err - } - chromedp.KeyEvent(kb.ArrowLeft).Do(ctx) - - maxIterations := 10 - miniTick := 10 * time.Millisecond - var location string - for i := 0; i < maxIterations; i++ { - - if err := chromedp.Location(&location).Do(ctx); err != nil { - return err - } - if location != prevLocation { - log.Printf("navLeft break at it:%d", i) - break - } - time.Sleep(miniTick) - } + chromedp.WaitReady("body", chromedp.ByQuery) return nil }