Improve Content-Type parsing

WGH- · WGH- · commit 31f0876c9912 · 2024-03-27T17:57:16.000+03:00
Instead of looking for an "html" substring, actually parse the MIME type
string. Don't use mime.ParseMediaType, though, as it returns an error on
invalid duplicate parameters (e.g. "text/html; charset=UTF-8; charset=utf-8")
that occur in the wild.
diff --git a/colly.go b/colly.go
@@ -1117,9 +1117,24 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
 }
 
 func (c *Collector) handleOnHTML(resp *Response) error {
-	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
+	if len(c.htmlCallbacks) == 0 {
 		return nil
 	}
+
+	contentType := resp.Headers.Get("Content-Type")
+	// implementation of mime.ParseMediaType without parsing the params
+	// part
+	mediatype, _, _ := strings.Cut(contentType, ";")
+	mediatype = strings.TrimSpace(strings.ToLower(mediatype))
+
+	// TODO we also want to parse application/xml as XHTML if it has
+	// appropriate doctype
+	switch mediatype {
+	case "text/html", "application/xhtml+xml":
+	default:
+		return nil
+	}
+
 	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
 	if err != nil {
 		return err