Skip to content

Commit 31f0876

Browse files
committed
Improve Content-Type parsing
Instead of looking for an "html" substring, actually parse the MIME type string. Don't use mime.ParseMediaType, though, as it returns an error on invalid duplicate parameters (e.g. "text/html; charset=UTF-8; charset=utf-8") that occur in the wild.
1 parent 26a5648 commit 31f0876

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

colly.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1117,9 +1117,24 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
11171117
}
11181118

11191119
func (c *Collector) handleOnHTML(resp *Response) error {
1120-
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
1120+
if len(c.htmlCallbacks) == 0 {
11211121
return nil
11221122
}
1123+
1124+
contentType := resp.Headers.Get("Content-Type")
1125+
// implementation of mime.ParseMediaType without parsing the params
1126+
// part
1127+
mediatype, _, _ := strings.Cut(contentType, ";")
1128+
mediatype = strings.TrimSpace(strings.ToLower(mediatype))
1129+
1130+
// TODO we also want to parse application/xml as XHTML if it has
1131+
// appropriate doctype
1132+
switch mediatype {
1133+
case "text/html", "application/xhtml+xml":
1134+
default:
1135+
return nil
1136+
}
1137+
11231138
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
11241139
if err != nil {
11251140
return err

0 commit comments

Comments
 (0)