Skip to content

Commit 6581387

Browse files
authored
Merge pull request #479 from WGH-/fix-relative-base
Fix relative <base> URL
2 parents 06c3255 + 6058416 commit 6581387

File tree

2 files changed: 42 additions and 2 deletions.

colly.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,7 +1057,7 @@ func (c *Collector) handleOnHTML(resp *Response) error {
10571057
return err
10581058
}
10591059
if href, found := doc.Find("base[href]").Attr("href"); found {
1060-
resp.Request.baseURL, _ = url.Parse(href)
1060+
resp.Request.baseURL, _ = resp.Request.URL.Parse(href)
10611061
}
10621062
for _, cc := range c.htmlCallbacks {
10631063
i := 0
@@ -1096,7 +1096,7 @@ func (c *Collector) handleOnXML(resp *Response) error {
10961096
if e := htmlquery.FindOne(doc, "//base"); e != nil {
10971097
for _, a := range e.Attr {
10981098
if a.Key == "href" {
1099-
resp.Request.baseURL, _ = url.Parse(a.Val)
1099+
resp.Request.baseURL, _ = resp.Request.URL.Parse(a.Val)
11001100
break
11011101
}
11021102
}

colly_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,21 @@ func newTestServer() *httptest.Server {
139139
`))
140140
})
141141

142+
mux.HandleFunc("/base_relative", func(w http.ResponseWriter, r *http.Request) {
143+
w.Header().Set("Content-Type", "text/html")
144+
w.Write([]byte(`<!DOCTYPE html>
145+
<html>
146+
<head>
147+
<title>Test Page</title>
148+
<base href="/foobar/" />
149+
</head>
150+
<body>
151+
<a href="z">link</a>
152+
</body>
153+
</html>
154+
`))
155+
})
156+
142157
mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
143158
w.Header().Set("Content-Type", "application/octet-stream")
144159
ww := bufio.NewWriter(w)
@@ -767,6 +782,31 @@ func TestBaseTag(t *testing.T) {
767782
c2.Visit(ts.URL + "/base")
768783
}
769784

785+
func TestBaseTagRelative(t *testing.T) {
786+
ts := newTestServer()
787+
defer ts.Close()
788+
789+
c := NewCollector()
790+
c.OnHTML("a[href]", func(e *HTMLElement) {
791+
u := e.Request.AbsoluteURL(e.Attr("href"))
792+
expected := ts.URL + "/foobar/z"
793+
if u != expected {
794+
t.Errorf("Invalid <base /> tag handling in OnHTML: expected %q, got %q", expected, u)
795+
}
796+
})
797+
c.Visit(ts.URL + "/base_relative")
798+
799+
c2 := NewCollector()
800+
c2.OnXML("//a", func(e *XMLElement) {
801+
u := e.Request.AbsoluteURL(e.Attr("href"))
802+
expected := ts.URL + "/foobar/z"
803+
if u != expected {
804+
t.Errorf("Invalid <base /> tag handling in OnXML: expected %q, got %q", expected, u)
805+
}
806+
})
807+
c2.Visit(ts.URL + "/base_relative")
808+
}
809+
770810
func TestCollectorCookies(t *testing.T) {
771811
ts := newTestServer()
772812
defer ts.Close()

0 commit comments

Comments
 (0)