archiveis/capture.go
package archiveis

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

var (
	BaseURL               = "https://archive.is" // Overrideable default package value.
	HTTPHost              = "archive.is"         // Overrideable default package value.
	UserAgent             = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36" // Overrideable default package value.
	DefaultRequestTimeout = 10 * time.Second // Overrideable default package value.
	DefaultPollInterval   = 5 * time.Second  // Overrideable default package value.

	jsLocationExpr = regexp.MustCompile(`document\.location\.replace\(["']([^"']+)`)
)

// Config settings for page capture client behavior.
type Config struct {
	Anyway         bool          // Force archival even if there is already a recent snapshot of the page.
	Wait           bool          // Wait until the crawl has been completed.
	WaitTimeout    time.Duration // Max time to wait for crawl completion. Default is unlimited.
	PollInterval   time.Duration // Interval between crawl completion checks. Defaults to 5s.
	RequestTimeout time.Duration // Overrides default request timeout.
	SubmitID       string        // Accepts a user-provided submitid.
}

// Capture archives the provided URL using the archive.is service.
func Capture(u string, cfg ...Config) (string, error) {
	timeout := DefaultRequestTimeout
	if len(cfg) > 0 && cfg[0].RequestTimeout > time.Duration(0) {
		timeout = cfg[0].RequestTimeout
	}

	var (
		submitID string
		anyway   string
		body     []byte
		resp     *http.Response
		final    string
		err      error
	)

	if len(cfg) > 0 && len(cfg[0].SubmitID) > 0 {
		submitID = cfg[0].SubmitID
		log.Printf("Will use caller-provided submitid=%v", submitID)
	} else if submitID, err = newSubmitID(timeout); err != nil {
		return "", err
	}

	if len(cfg) > 0 && cfg[0].Anyway {
		anyway = "&anyway=1"
	}

	content := fmt.Sprintf("submitid=%v&url=%v%v", url.QueryEscape(submitID), url.QueryEscape(u), anyway)

	resp, body, err = doRequest("POST", BaseURL+"/submit/", io.NopCloser(bytes.NewBufferString(content)), timeout)
	if err != nil {
		return "", err
	}

	if resp.StatusCode/100 == 3 {
		// Page has already been archived.
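		// Config.Anyway can be set to force a fresh capture even when a recent
		// snapshot already exists.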
log.Print("Detected redirect to archived page") if loc := resp.Header.Get("Location"); len(loc) == 0 { return "", fmt.Errorf("received a redirect status-code %v with an empty Location header", resp.StatusCode) } else { final = loc } } else { // log.Printf("body: %+v\n", string(body)) // log.Printf("headers: %+v\n", resp.Header) // log.Printf("trailers: %+v\n", resp.Trailer) doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) if err != nil { return "", fmt.Errorf("constructing goquery doc from submission response: %s", err) } if script := doc.Find("script").First(); script != nil { js := strings.Trim(script.Text(), "\r\n\t ") if match := jsLocationExpr.FindStringSubmatch(js); len(match) > 1 { final = match[1] } } if len(final) == 0 { input := doc.Find("input[name=id]").First() if input == nil { return "", errors.New("page archive ID not found in submission response content") } id, exists := input.Attr("value") if !exists { log.Printf("No page archive ID value detected, here was the page content: %v", string(body)) return "", errors.New("no page archive ID value available") } final = fmt.Sprintf("%v/%v", BaseURL, id) } } log.Printf("Capture for url=%v -> %v", u, final) if len(cfg) > 0 && cfg[0].Wait { var ( waitTimeout = cfg[0].WaitTimeout pollInterval = DefaultPollInterval ) if cfg[0].PollInterval > time.Duration(0) { pollInterval = cfg[0].PollInterval } if err := waitForCrawlToFinish(final, body, timeout, waitTimeout, pollInterval); err != nil { return final, err } } return final, nil } // newSubmitID gets the index page and extracts the form submission identifier. func newSubmitID(timeout time.Duration) (string, error) { _, body, err := doRequest("", BaseURL, nil, timeout) if err != nil { return "", err } doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) if err != nil { return "", fmt.Errorf("constructing goquery doc from index: %s", err) } input := doc.Find("input[name=submitid]").First() if input == nil { return "", errors.New("no submitid element found") } id, exists := input.Attr("value") if !exists { return "", errors.New("no submitid value available") } return id, nil } func waitForCrawlToFinish(url string, body []byte, requestTimeout time.Duration, waitTimeout time.Duration, pollInterval time.Duration) error { var ( expr = regexp.MustCompile(`<html><body>`) until = time.Now().Add(waitTimeout) d = time.Now().Sub(until) err error ) if body != nil && !expr.Match(body) { // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) if err := checkCrawlResult(body); err != nil { return err } return nil } // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Debug("Waiting for crawl to finish") for { if waitTimeout != time.Duration(0) && time.Now().After(until) { return fmt.Errorf("timed out after %s waiting for crawl to complete", waitTimeout) } _, body, err = doRequest("", url, nil, requestTimeout) d = time.Now().Sub(until) if err != nil { log.Printf("Non-fatal error while polling for crawl completion: %s (continuing on, waiting for %s so far)", err, d) } else if !expr.Match(body) { // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) break } time.Sleep(pollInterval) } return nil } // checkCrawlResult searches for known archive.is errors in HTML content. 
func checkCrawlResult(body []byte) error {
	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body))
	if err != nil {
		return fmt.Errorf("crawl result check gq new doc: %s", err)
	}
	if block := doc.Find("html > body > div").First(); block.Length() > 0 {
		if text := strings.Trim(block.Text(), "\r\n\t "); text == "Error: Network error." {
			return fmt.Errorf("archive.is crawl result: Network Error")
		}
	}
	return nil
}
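
// Usage sketch (illustrative only; the URL and durations below are placeholders,
// not values taken from this package). It shows how a caller outside the package
// might drive Capture with a Config that waits for the crawl to finish:
//
//	archiveURL, err := archiveis.Capture("https://example.com", archiveis.Config{
//		Wait:         true,
//		WaitTimeout:  5 * time.Minute,
//		PollInterval: 10 * time.Second,
//	})
//	if err != nil {
//		log.Fatal(err)
//	}
//	log.Printf("Snapshot available at %s", archiveURL)
//
// Note: doRequest is defined elsewhere in this package. Judging only from the
// call sites above, its signature is approximately
//
//	doRequest(method, url string, body io.ReadCloser, timeout time.Duration) (*http.Response, []byte, error)
//
// and it presumably applies the HTTPHost and UserAgent package values to outgoing
// requests; that behavior is an assumption, not something this file shows.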