small pixel drawing of a pufferfish vore

archive.is -> archive.org
Jes Olson j3s@c3f.net
Fri, 21 Feb 2025 19:41:59 -0500
commit

8578094d392492921f0d65aab43a83d51cd6fc76

parent

9d6fb2a0444fa32ab8492924d49b3153f058241c

9 files changed, 136 insertions(+), 327 deletions(-)

jump to
D archiveis/capture.go

@@ -1,206 +0,0 @@

-package archiveis - -import ( - "bytes" - "errors" - "fmt" - "io" - "log" - "net/http" - "net/url" - "regexp" - "strings" - "time" - - "github.com/PuerkitoBio/goquery" -) - -var ( - BaseURL = "https://archive.is" // Overrideable default package value. - HTTPHost = "archive.is" // Overrideable default package value. - UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36" // Overrideable default package value. - DefaultRequestTimeout = 10 * time.Second // Overrideable default package value. - DefaultPollInterval = 5 * time.Second // Overrideable default package value. - - jsLocationExpr = regexp.MustCompile(`document\.location\.replace\(["']([^"']+)`) -) - -// Config settings for page capture client behavior. -type Config struct { - Anyway bool // Force archival even if there is already a recent snapshot of the page. - Wait bool // Wait until the crawl has been completed. - WaitTimeout time.Duration // Max time to wait for crawl completion. Default is unlimited. - PollInterval time.Duration // Interval between crawl completion checks. Defaults to 5s. - RequestTimeout time.Duration // Overrides default request timeout. - SubmitID string // Accepts a user-provided submitid. -} - -// Capture archives the provided URL using the archive.is service. 
-func Capture(u string, cfg ...Config) (string, error) { - timeout := DefaultRequestTimeout - if len(cfg) > 0 && cfg[0].RequestTimeout > time.Duration(0) { - timeout = cfg[0].RequestTimeout - } - - var ( - submitID string - anyway string - body []byte - resp *http.Response - final string - err error - ) - - if len(cfg) > 0 && len(cfg[0].SubmitID) > 0 { - submitID = cfg[0].SubmitID - log.Printf("Will use caller-provided submitid=%v", submitID) - } else if submitID, err = newSubmitID(timeout); err != nil { - return "", err - } - - if len(cfg) > 0 && cfg[0].Anyway { - anyway = "&anyway=1" - } - - content := fmt.Sprintf("submitid=%v&url=%v%v", url.QueryEscape(submitID), url.QueryEscape(u), anyway) - - resp, body, err = doRequest("POST", BaseURL+"/submit/", io.NopCloser(bytes.NewBufferString(content)), timeout) - if err != nil { - return "", err - } - - if resp.StatusCode/100 == 3 { - // Page has already been archived. - log.Print("Detected redirect to archived page") - - if loc := resp.Header.Get("Location"); len(loc) == 0 { - return "", fmt.Errorf("received a redirect status-code %v with an empty Location header", resp.StatusCode) - } else { - final = loc - } - } else { - // log.Printf("body: %+v\n", string(body)) - // log.Printf("headers: %+v\n", resp.Header) - // log.Printf("trailers: %+v\n", resp.Trailer) - - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) - if err != nil { - return "", fmt.Errorf("constructing goquery doc from submission response: %s", err) - } - - if script := doc.Find("script").First(); script != nil { - js := strings.Trim(script.Text(), "\r\n\t ") - if match := jsLocationExpr.FindStringSubmatch(js); len(match) > 1 { - final = match[1] - } - } - - if len(final) == 0 { - input := doc.Find("input[name=id]").First() - if input == nil { - return "", errors.New("page archive ID not found in submission response content") - } - id, exists := input.Attr("value") - if !exists { - log.Printf("No page archive ID value detected, here was 
the page content: %v", string(body)) - return "", errors.New("no page archive ID value available") - } - - final = fmt.Sprintf("%v/%v", BaseURL, id) - } - } - - log.Printf("Capture for url=%v -> %v", u, final) - - if len(cfg) > 0 && cfg[0].Wait { - var ( - waitTimeout = cfg[0].WaitTimeout - pollInterval = DefaultPollInterval - ) - - if cfg[0].PollInterval > time.Duration(0) { - pollInterval = cfg[0].PollInterval - } - - if err := waitForCrawlToFinish(final, body, timeout, waitTimeout, pollInterval); err != nil { - return final, err - } - } - - return final, nil -} - -// newSubmitID gets the index page and extracts the form submission identifier. -func newSubmitID(timeout time.Duration) (string, error) { - _, body, err := doRequest("", BaseURL, nil, timeout) - if err != nil { - return "", err - } - - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) - if err != nil { - return "", fmt.Errorf("constructing goquery doc from index: %s", err) - } - - input := doc.Find("input[name=submitid]").First() - if input == nil { - return "", errors.New("no submitid element found") - } - id, exists := input.Attr("value") - if !exists { - return "", errors.New("no submitid value available") - } - return id, nil -} - -func waitForCrawlToFinish(url string, body []byte, requestTimeout time.Duration, waitTimeout time.Duration, pollInterval time.Duration) error { - var ( - expr = regexp.MustCompile(`<html><body>`) - until = time.Now().Add(waitTimeout) - d = time.Now().Sub(until) - err error - ) - - if body != nil && !expr.Match(body) { - // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) - if err := checkCrawlResult(body); err != nil { - return err - } - return nil - } - - // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Debug("Waiting for crawl to finish") - for { - if waitTimeout != time.Duration(0) && 
time.Now().After(until) { - return fmt.Errorf("timed out after %s waiting for crawl to complete", waitTimeout) - } - - _, body, err = doRequest("", url, nil, requestTimeout) - - d = time.Now().Sub(until) - - if err != nil { - log.Printf("Non-fatal error while polling for crawl completion: %s (continuing on, waiting for %s so far)", err, d) - } else if !expr.Match(body) { - // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) - break - } - - time.Sleep(pollInterval) - } - return nil -} - -// checkCrawlResult searches for known archive.is errors in HTML content. -func checkCrawlResult(body []byte) error { - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) - if err != nil { - return fmt.Errorf("crawl result check gq new doc: %s", err) - } - if block := doc.Find("html > body > div").First(); block != nil { - if text := strings.Trim(block.Text(), "\r\n\t "); text == "Error: Network error." { - return fmt.Errorf("archive.is crawl result: Network Error") - } - } - return nil -}
D archiveis/http.go

@@ -1,74 +0,0 @@

-package archiveis - -import ( - "fmt" - "io" - "net" - "net/http" - "strings" - "time" -) - -func doRequest(method string, url string, body io.ReadCloser, timeout time.Duration) (*http.Response, []byte, error) { - req, err := newRequest(method, url, body) - if err != nil { - return nil, nil, err - } - - if method != "" && method != "get" { - req.Header.Set("content-type", "application/x-www-form-urlencoded") - } - - client := newClient(timeout) - resp, err := client.Do(req) - if err != nil { - return resp, nil, fmt.Errorf("executing request: %s", err) - } - if resp.StatusCode/100 != 2 && resp.StatusCode/100 != 3 { - return resp, nil, fmt.Errorf("%v request to %v received unhappy response status-code=%v", method, url, resp.StatusCode) - } - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return resp, nil, fmt.Errorf("reading response body: %s", err) - } - if err := resp.Body.Close(); err != nil { - return resp, respBody, fmt.Errorf("closing response body: %s", err) - } - return resp, respBody, nil -} - -func newRequest(method string, url string, body io.ReadCloser) (*http.Request, error) { - req, err := http.NewRequest(method, url, body) - if err != nil { - return nil, fmt.Errorf("creating %v request to %v: %s", method, url, err) - } - - req.Host = HTTPHost - - hostname := strings.Split(BaseURL, "://")[1] - req.Header.Set("Host", hostname) - req.Header.Set("Origin", hostname) - req.Header.Set("Authority", hostname) - req.Header.Set("User-Agent", UserAgent) - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8") - req.Header.Set("Referer", BaseURL+"/") - - return req, nil -} - -func newClient(timeout time.Duration) *http.Client { - c := &http.Client{ - Timeout: timeout, - Transport: &http.Transport{ - Proxy: http.ProxyFromEnvironment, - Dial: (&net.Dialer{ - Timeout: timeout, - KeepAlive: timeout, - }).Dial, - TLSHandshakeTimeout: timeout, - ResponseHeaderTimeout: timeout, - 
ExpectContinueTimeout: 1 * time.Second, - }, - } - return c -}
D archiveis/readme

@@ -1,1 +0,0 @@

-forked from https://github.com/jaytaylor/archive.is
M files/settings.tmpl.html

@@ -32,7 +32,7 @@ and press [subscribe]

https://100r.co/links/rss.xml https://begriffs.com/atom.xml -https://blog.passtheballsocrates.com/feed/ +https://blog.stillgreenmoss.net/feed/ https://facklambda.dev/atom.xml https://herman.bearblog.dev/feed/ https://j3s.sh/feed.atom
M go.mod

@@ -3,19 +3,16 @@

go 1.22 require ( - github.com/PuerkitoBio/goquery v1.9.1 github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 github.com/glebarez/go-sqlite v1.21.2 golang.org/x/crypto v0.19.0 ) require ( - github.com/andybalholm/cascadia v1.3.2 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/google/uuid v1.3.0 // indirect github.com/mattn/go-isatty v0.0.19 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect - golang.org/x/net v0.21.0 // indirect golang.org/x/sys v0.17.0 // indirect modernc.org/libc v1.24.1 // indirect modernc.org/mathutil v1.6.0 // indirect
M go.sum

@@ -1,7 +1,3 @@

-github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= -github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= -github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= -github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 h1:OYA+5W64v3OgClL+IrOD63t4i/RW7RqrAVl9LTZ9UqQ= github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394/go.mod h1:Q8n74mJTIgjX4RBBcHnJ05h//6/k6foqmgE45jTQtxg= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=

@@ -16,47 +12,11 @@ github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=

github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod 
h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= modernc.org/libc v1.24.1 h1:uvJSeCKL/AgzBo2yYIPPTy82v21KgGnizcGYfBHaNuM= modernc.org/libc v1.24.1/go.mod h1:FmfO1RLrU3MHJfyi9eYYmZBfi/R+tqZ6+hQ3yQQUkak= modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
M site.go

@@ -1,6 +1,7 @@

package main import ( + "context" "errors" "fmt" "html/template"

@@ -13,11 +14,11 @@ "path/filepath"

"strings" "time" - "git.j3s.sh/vore/archiveis" "git.j3s.sh/vore/lib" "git.j3s.sh/vore/reaper" "git.j3s.sh/vore/rss" "git.j3s.sh/vore/sqlite" + "git.j3s.sh/vore/wayback" "golang.org/x/crypto/bcrypt" )

@@ -138,7 +139,10 @@ if err != nil {

fmt.Fprintf(w, "error!") return } - archiveURL, err := archiveis.Capture(decodedURL) + + c := wayback.Client{} + + archiveURL, err := c.Archive(context.Background(), decodedURL) if err != nil { log.Println(err) fmt.Fprintf(w, "error capturing archive!!")
A wayback/readme

@@ -0,0 +1,1 @@

+forked from https://github.com/wabarc/archive.org/blob/main/ia.go
A wayback/wayback.go

@@ -0,0 +1,128 @@

+package wayback + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "regexp" +) + +type Client struct { + httpClient *http.Client +} + +const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" + +var ( + host = "archive.org" + dest = "https://web." + host + base = "https://web.archive.org/save/" + + endpoint = "https://archive.org/wayback/available" +) + +// Wayback is the handle of saving webpages to archive.org +func (wbrc *Client) Archive(ctx context.Context, u string) (result string, err error) { + if wbrc.httpClient == nil { + wbrc.httpClient = &http.Client{ + CheckRedirect: noRedirect, + } + } + + result, err = wbrc.archive(ctx, u) + if err != nil { + return + } + return +} + +func (wbrc *Client) archive(ctx context.Context, uri string) (string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+uri, nil) + if err != nil { + return "", err + } + req.Header.Add("User-Agent", userAgent) + resp, err := wbrc.httpClient.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + var loc string + loc = resp.Header.Get("Content-Location") + + if len(loc) > 0 { + return loc, nil + } + + loc = resp.Header.Get("Location") + if len(loc) > 0 { + return loc, nil + } + + links := resp.Header.Get("Link") + re := regexp.MustCompile(`(?m)http[s]?:\/\/web\.archive\.org/web/[-a-zA-Z0-9@:%_\+.~#?&//=]*`) + if match := re.FindAllString(links, -1); len(match) > 0 { + loc = match[len(match)-1] + return fmt.Sprintf("%v", loc), nil + } + + loc = resp.Request.URL.String() + if match := re.FindAllString(loc, -1); len(match) > 0 { + return fmt.Sprintf("%v", loc), nil + } + + loc, err = wbrc.latest(ctx, uri) + if err != nil { + loc = base + uri + } + + // HTTP 509 Bandwidth Limit Exceeded + if resp.StatusCode == 509 { + return fmt.Sprint(loc), nil + } + + if resp.StatusCode != 200 { + return fmt.Sprint(loc), nil + } + + return loc, nil +} + +func (wbrc 
*Client) latest(_ context.Context, u string) (string, error) { + // https://web.archive.org/*/https://example.org + result := fmt.Sprintf("%s/*/%s", dest, u) + + uri := endpoint + "?url=" + u + resp, err := wbrc.httpClient.Get(uri) + if err != nil { + return "", err + } + defer resp.Body.Close() + + data, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + + var dat map[string]interface{} + if err := json.Unmarshal(data, &dat); err != nil { + return "", err + } + + if archived, ok := dat["archived_snapshots"].(map[string]interface{}); ok { + if closest, ok := archived["closest"].(map[string]interface{}); ok { + if closest["available"].(bool) && closest["status"] == "200" { + return closest["url"].(string), nil + } + } + } + + return result, fmt.Errorf("Not found") +} + +func noRedirect(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse +}