Skip to content

Commit d8c4e07

Browse files
committed
[fix] handle xml/html parsing errors
fixes #135
1 parent 46b668d commit d8c4e07

File tree

1 file changed

+20
-12
lines changed

1 file changed

+20
-12
lines changed

colly.go

Lines changed: 20 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -587,13 +587,19 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct
587587

588588
c.handleOnResponse(response)
589589

590-
c.handleOnHTML(response)
590+
err = c.handleOnHTML(response)
591+
if err != nil {
592+
c.handleOnError(response, err, request, ctx)
593+
}
591594

592-
c.handleOnXML(response)
595+
err = c.handleOnXML(response)
596+
if err != nil {
597+
c.handleOnError(response, err, request, ctx)
598+
}
593599

594600
c.handleOnScraped(response)
595601

596-
return nil
602+
return err
597603
}
598604

599605
func (c *Collector) requestCheck(u, method string, depth int, checkRevisit bool) error {
@@ -912,13 +918,13 @@ func (c *Collector) handleOnResponse(r *Response) {
912918
}
913919
}
914920

915-
func (c *Collector) handleOnHTML(resp *Response) {
921+
func (c *Collector) handleOnHTML(resp *Response) error {
916922
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
917-
return
923+
return nil
918924
}
919925
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
920926
if err != nil {
921-
return
927+
return err
922928
}
923929
if href, found := doc.Find("base[href]").Attr("href"); found {
924930
resp.Request.baseURL, _ = url.Parse(href)
@@ -937,21 +943,22 @@ func (c *Collector) handleOnHTML(resp *Response) {
937943
}
938944
})
939945
}
946+
return nil
940947
}
941948

942-
func (c *Collector) handleOnXML(resp *Response) {
949+
func (c *Collector) handleOnXML(resp *Response) error {
943950
if len(c.xmlCallbacks) == 0 {
944-
return
951+
return nil
945952
}
946953
contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
947954
if !strings.Contains(contentType, "html") && !strings.Contains(contentType, "xml") {
948-
return
955+
return nil
949956
}
950957

951958
if strings.Contains(contentType, "html") {
952959
doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
953960
if err != nil {
954-
return
961+
return err
955962
}
956963
if e := htmlquery.FindOne(doc, "//base/@href"); e != nil {
957964
for _, a := range e.Attr {
@@ -977,7 +984,7 @@ func (c *Collector) handleOnXML(resp *Response) {
977984
} else if strings.Contains(contentType, "xml") {
978985
doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
979986
if err != nil {
980-
return
987+
return err
981988
}
982989

983990
for _, cc := range c.xmlCallbacks {
@@ -993,13 +1000,14 @@ func (c *Collector) handleOnXML(resp *Response) {
9931000
})
9941001
}
9951002
}
1003+
return nil
9961004
}
9971005

9981006
func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error {
9991007
if err == nil && (c.ParseHTTPErrorResponse || response.StatusCode < 203) {
10001008
return nil
10011009
}
1002-
if err == nil {
1010+
if err == nil && response.StatusCode >= 203 {
10031011
err = errors.New(http.StatusText(response.StatusCode))
10041012
}
10051013
if response == nil {

0 commit comments

Comments
 (0)