From a8b1e3699b3d7f3873df185e0449d81b9b0eb265 Mon Sep 17 00:00:00 2001 From: WGH Date: Wed, 27 Mar 2024 19:00:24 +0300 Subject: [PATCH] Fix more cases of pages redirecting to themselves This was "fixed" in b4ca6a7a17 (#763), but the fix turned out to be incomplete. That fix only allowed redirects leading to the same URL as the original destination, and didn't take into account more complicated cases. Such as, for example: * www.example.com * example.com * (set cookie) * example.com --- colly.go | 24 +++++++++++++++++++----- colly_test.go | 24 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/colly.go b/colly.go index 451b6766..9f2d6b85 100644 --- a/colly.go +++ b/colly.go @@ -1343,12 +1343,26 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ return fmt.Errorf("Not following redirect to %q: %w", req.URL, err) } - // allow redirects to the original destination - // to support websites redirecting to the same page while setting - // session cookies - samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String()) + // Page may set cookies and respond with a redirect to itself. + // Some example of such redirect "cycles": + // + // example.com -(set cookie)-> example.com + // example.com -> auth.example.com -(set cookie)-> example.com + // www.example.com -> example.com -(set cookie)-> example.com + // + // We must not return "already visited" error in such cases. + // So ignore redirect cycles when checking for URL revisit. + redirectCycle := false + normalizedURL := normalizeURL(req.URL.String()) + for _, viaReq := range via { + viaUrl := normalizeURL(viaReq.URL.String()) + if viaUrl == normalizedURL { + redirectCycle = true + break + } + } - if !c.AllowURLRevisit && !samePageRedirect { + if !c.AllowURLRevisit && !redirectCycle { var body io.ReadCloser if req.GetBody != nil { var err error diff --git a/colly_test.go b/colly_test.go index 2382ecb1..9b3a650b 100644 --- a/colly_test.go +++ b/colly_test.go @@ -783,6 +783,30 @@ func TestSetCookieRedirect(t *testing.T) { } } +func TestSetCookieComplexRedirectCycle(t *testing.T) { + // server2 -> server1 -(set cookie)-> server1 + ts1 := newUnstartedTestServer() + ts1.Config.Handler = requireSessionCookieSimple(ts1.Config.Handler) + ts1.Start() + defer ts1.Close() + + ts2 := httptest.NewServer(http.RedirectHandler(ts1.URL, http.StatusMovedPermanently)) + defer ts2.Close() + + c := NewCollector() + c.OnResponse(func(r *Response) { + if got, want := r.Body, serverIndexResponse; !bytes.Equal(got, want) { + t.Errorf("bad response body got=%q want=%q", got, want) + } + if got, want := r.StatusCode, http.StatusOK; got != want { + t.Errorf("bad response code got=%d want=%d", got, want) + } + }) + if err := c.Visit(ts2.URL); err != nil { + t.Fatal(err) + } +} + func TestCollectorPostURLRevisitCheck(t *testing.T) { ts := newTestServer() defer ts.Close()