add initial favicon fetcher
Jes Olson j3s@c3f.net
Tue, 30 Sep 2025 10:09:50 -0500
4 files changed, 340 insertions(+), 17 deletions(-)
A
favicon/favicon.go
@@ -0,0 +1,287 @@
+package favicon + +import ( + "context" + "encoding/base64" + "fmt" + "io" + "log" + "net/http" + "net/url" + "slices" + "strings" + "sync" + "time" + + "golang.org/x/net/html" +) + +type FaviconCache struct { + // domain names -> b64 data URLs + cache map[string]string + mutex sync.RWMutex +} + +type FaviconFetcher struct { + cache *FaviconCache + client *http.Client +} + +func NewFaviconFetcher() *FaviconFetcher { + return &FaviconFetcher{ + cache: &FaviconCache{ + cache: make(map[string]string), + }, + client: &http.Client{ + Timeout: 10 * time.Second, + }, + } +} + +func (f *FaviconFetcher) GetFaviconDataURL(domain string) string { + f.cache.mutex.RLock() + defer f.cache.mutex.RUnlock() + + return f.cache.cache[domain] +} + +func (f *FaviconFetcher) FetchFaviconsForDomains(feedURLs []string) { + domains := f.extractUniqueDomains(feedURLs) + + log.Printf("favicon: starting to fetch favicons for %d unique domains", len(domains)) + + const maxWorkers = 5 + domainChan := make(chan string, len(domains)) + var wg sync.WaitGroup + + for range maxWorkers { + wg.Add(1) + go func() { + defer wg.Done() + for domain := range domainChan { + f.fetchFaviconForDomain(domain) + } + }() + } + + for _, domain := range domains { + domainChan <- domain + } + close(domainChan) + + wg.Wait() + + log.Printf("favicon: finished fetching favicons, cached %d successful results", len(f.cache.cache)) +} + +// extractUniqueDomains extracts unique domain names from a list of URLs +func (f *FaviconFetcher) extractUniqueDomains(urls []string) []string { + // use a map as a cheap "uniqueness" filter okie dokie + domainSet := make(map[string]bool) + + for _, rawURL := range urls { + parsedURL, err := url.Parse(rawURL) + if err != nil { + continue + } + + domain := parsedURL.Hostname() + if domain != "" { + domainSet[domain] = true + } + } + + domains := make([]string, 0, len(domainSet)) + for domain := range domainSet { + domains = append(domains, domain) + } + + return domains +} + +// 
fetchFaviconForDomain attempts to fetch a favicon for a specific domain +// It first tries to parse the HTML page to find favicon link tags, then falls back to common paths +func (f *FaviconFetcher) fetchFaviconForDomain(domain string) { + baseURL := fmt.Sprintf("https://%s", domain) + + faviconURLs := f.discoverFaviconFromHTML(baseURL) + faviconURLs = append(faviconURLs, + fmt.Sprintf("https://%s/favicon.ico", domain), + fmt.Sprintf("https://%s/favicon.png", domain), + fmt.Sprintf("https://%s/apple-touch-icon.png", domain), + fmt.Sprintf("https://%s/apple-touch-icon-precomposed.png", domain), + ) + + for _, faviconURL := range faviconURLs { + dataURL := f.fetchFaviconDataURL(faviconURL) + if dataURL != "" { + f.cache.mutex.Lock() + f.cache.cache[domain] = dataURL + f.cache.mutex.Unlock() + return + } + } + + log.Printf("favicon: no favicon found for domain %s", domain) +} + +// fetchFaviconDataURL fetches a favicon from the given URL and converts it to a data URL +// Returns empty string if the favicon could not be fetched or converted +func (f *FaviconFetcher) fetchFaviconDataURL(faviconURL string) string { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, faviconURL, nil) + if err != nil { + return "" + } + + req.Header.Set("User-Agent", "vore: favicon fetcher") + + resp, err := f.client.Do(req) + if err != nil { + return "" + } + defer resp.Body.Close() + + // 2xx == good + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "" + } + + // make sure it looks like an img + contentType := resp.Header.Get("Content-Type") + if contentType != "" && !strings.HasPrefix(contentType, "image/") && + !strings.Contains(contentType, "icon") && !strings.Contains(contentType, "octet-stream") { + return "" + } + + const maxFaviconSize = 1024 * 1024 // 1MB limit + data, err := io.ReadAll(io.LimitReader(resp.Body, maxFaviconSize)) + if err != nil { + return "" + } + 
+ if len(data) == 0 { + return "" + } + + if contentType == "" { + if strings.HasSuffix(faviconURL, ".png") { + contentType = "image/png" + } else if strings.HasSuffix(faviconURL, ".ico") { + contentType = "image/x-icon" + } else { + contentType = "image/x-icon" // fallback + } + } + + // data URL == base64 encoded favicon + encodedData := base64.StdEncoding.EncodeToString(data) + dataURL := fmt.Sprintf("data:%s;base64,%s", contentType, encodedData) + + return dataURL +} + +// discoverFaviconFromHTML fetches the HTML page and parses it to find favicon link tags +func (f *FaviconFetcher) discoverFaviconFromHTML(baseURL string) []string { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL, nil) + if err != nil { + return []string{} + } + + req.Header.Set("User-Agent", "vore: favicon fetcher") + + resp, err := f.client.Do(req) + if err != nil { + return []string{} + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return []string{} + } + + doc, err := html.Parse(resp.Body) + if err != nil { + return []string{} + } + + parsedBaseURL, err := url.Parse(baseURL) + if err != nil { + return []string{} + } + + return f.extractFaviconURLsFromHTML(doc, parsedBaseURL) +} + +// extractFaviconURLsFromHTML walks the HTML document tree and extracts favicon URLs +func (f *FaviconFetcher) extractFaviconURLsFromHTML(doc *html.Node, baseURL *url.URL) []string { + var faviconURLs []string + var walkHTML func(*html.Node) + + walkHTML = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "link" { + var rel, href, sizes string + for _, attr := range n.Attr { + switch attr.Key { + case "rel": + rel = strings.ToLower(attr.Val) + case "href": + href = attr.Val + case "sizes": + sizes = attr.Val + } + } + + // check if this is a favicon-related link + if f.isFaviconRel(rel) && href != "" { + // make href absolute if necessary + 
faviconURL, err := baseURL.Parse(href) + if err == nil { + // prioritize larger icons by putting them first + if f.isLargerIcon(sizes) { + faviconURLs = append([]string{faviconURL.String()}, faviconURLs...) + } else { + faviconURLs = append(faviconURLs, faviconURL.String()) + } + } + } + } + + // recursively walk child nodes + for c := n.FirstChild; c != nil; c = c.NextSibling { + walkHTML(c) + } + } + + walkHTML(doc) + return faviconURLs +} + +// isFaviconRel checks if a link rel attribute indicates a favicon +func (f *FaviconFetcher) isFaviconRel(rel string) bool { + faviconRels := []string{ + "icon", + "shortcut icon", + "apple-touch-icon", + "apple-touch-icon-precomposed", + "mask-icon", + } + + if slices.Contains(faviconRels, rel) { + return true + } + return false +} + +// isLargerIcon checks if the sizes attribute indicates a larger icon (for prioritization) +func (f *FaviconFetcher) isLargerIcon(sizes string) bool { + return strings.Contains(sizes, "32x32") || + strings.Contains(sizes, "64x64") || + strings.Contains(sizes, "128x128") || + strings.Contains(sizes, "192x192") +}
M
files/user.tmpl.html
→
files/user.tmpl.html
@@ -15,7 +15,7 @@ <ul>
{{ range .Data.Items }} <li{{ if and $.LoggedIn (index $.Data.ReadItems .Link) }} class="read"{{ end }}> <a href="{{ if $.LoggedIn }}/read/{{ .Link | escapeURL }}{{ else }}{{ .Link }}{{ end }}"> - {{ if .Title }} {{ .Title }} {{ else }} (empty title) {{ end }} + {{ .Link | faviconForURL }}{{ if .Title }} {{ .Title }} {{ else }} (empty title) {{ end }} </a> <br> <span class=puny title="{{ .Date }}">
M
reaper/reaper.go
→
reaper/reaper.go
@@ -33,7 +33,7 @@ if err != nil {
return nil, err } - req.Header.Set("User-Agent", "Vore") + req.Header.Set("User-Agent", "vore: feed fetcher") fid, exists := r.db.GetFeedIDAndExists(url) if exists {
M
site.go
→
site.go
@@ -4,6 +4,7 @@ import (
"context" "errors" "fmt" + "html" "html/template" "log" "math/rand"@@ -14,13 +15,14 @@ "path/filepath"
"strings" "time" + "git.j3s.sh/vore/favicon" "git.j3s.sh/vore/lib" "git.j3s.sh/vore/reaper" "git.j3s.sh/vore/rss" "git.j3s.sh/vore/sqlite" "git.j3s.sh/vore/wayback" "golang.org/x/crypto/bcrypt" - "golang.org/x/net/html" + nethtml "golang.org/x/net/html" ) type Site struct {@@ -32,6 +34,9 @@ reaper *reaper.Reaper
// site database handle db *sqlite.DB + + // favicon fetcher for caching favicons + faviconFetcher *favicon.FaviconFetcher } type Save struct {@@ -47,11 +52,24 @@ // - busy_timeout=5000: locky locky 5 secs
// - synchronous=NORMAL: "The synchronous=NORMAL setting is a good choice for most applications running in WAL mode." // - cache_size=-64000: 64MB ram for db cache (yum yum more perf) db := sqlite.New("vore.db?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)&_pragma=busy_timeout(5000)&_pragma=synchronous(NORMAL)&_pragma=cache_size(-64000)") + + // init favicon fetcher + faviconFetcher := favicon.NewFaviconFetcher() + s := Site{ - title: "vore", - reaper: reaper.New(db), - db: db, + title: "vore", + reaper: reaper.New(db), + db: db, + faviconFetcher: faviconFetcher, } + + // favi fetchy - every day or so + go func() { + log.Println("favicon: starting favicon fetch for all feed domains") + feedURLs := db.GetAllFeedURLs() + faviconFetcher.FetchFaviconsForDomains(feedURLs) + }() + return &s }@@ -177,12 +195,12 @@ return
} items := s.reaper.TrimFuturePosts(s.reaper.SortFeedItemsByDate(s.reaper.GetUserFeeds(username))) - + var readItems map[string]bool if s.loggedIn(r) { readItems = s.db.GetUserReadItems(s.username(r)) } - + data := struct { User string Items []*rss.Item@@ -338,7 +356,7 @@ http.Error(w, "non-2xx status from site: "+resp.Status, http.StatusBadGateway)
return } - doc, err := html.Parse(resp.Body) + doc, err := nethtml.Parse(resp.Body) if err != nil { http.Error(w, "failed to parse HTML: "+err.Error(), http.StatusInternalServerError) return@@ -368,11 +386,11 @@ fmt.Fprintln(w, `</body></html>`)
} } -func discoverFeeds(doc *html.Node, base *url.URL) []string { +func discoverFeeds(doc *nethtml.Node, base *url.URL) []string { var feeds []string - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "link" { + var f func(*nethtml.Node) + f = func(n *nethtml.Node) { + if n.Type == nethtml.ElementNode && n.Data == "link" { var rel, typ, href string for _, attr := range n.Attr { switch attr.Key {@@ -480,10 +498,11 @@ // template execution engine. it's normally the last thing a
// handler should do tbh. func (s *Site) renderPage(w http.ResponseWriter, r *http.Request, page string, data any) { funcMap := template.FuncMap{ - "printDomain": s.printDomain, - "timeSince": s.timeSince, - "trimSpace": strings.TrimSpace, - "escapeURL": url.QueryEscape, + "printDomain": s.printDomain, + "timeSince": s.timeSince, + "trimSpace": strings.TrimSpace, + "escapeURL": url.QueryEscape, + "faviconForURL": s.faviconForURL, } tmplFiles := filepath.Join("files", "*.tmpl.html")@@ -526,6 +545,23 @@ trimmedStr = strings.TrimPrefix(trimmedStr, "http://")
trimmedStr = strings.TrimPrefix(trimmedStr, "https://") return strings.Split(trimmedStr, "/")[0] +} + +// faviconForURL returns the cached favicon data URL for a given URL's domain +func (s *Site) faviconForURL(rawURL string) template.HTML { + domain := s.printDomain(rawURL) + if domain == "" { + return template.HTML("") + } + faviconDataURL := s.faviconFetcher.GetFaviconDataURL(domain) + if faviconDataURL == "" { + return template.HTML("") + } + + imgTag := fmt.Sprintf(`<img src="%s" alt="" style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;">`, + html.EscapeString(faviconDataURL)) + + return template.HTML(imgTag) } func (s *Site) timeSince(t time.Time) string {