2021-11-20 21:26:39 +00:00
|
|
|
/*
|
|
|
|
* Ekster is a microsub server
|
|
|
|
* Copyright (c) 2021 The Ekster authors
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2019-03-20 10:11:17 +00:00
|
|
|
// Package fetch provides an API for fetching information about urls.
|
2018-08-05 11:45:12 +00:00
|
|
|
package fetch
|
2018-02-16 21:13:01 +00:00
|
|
|
|
|
|
|
import (
|
2018-08-18 17:17:07 +00:00
|
|
|
"bytes"
|
2018-04-07 14:36:27 +00:00
|
|
|
"encoding/hex"
|
2018-02-16 21:13:01 +00:00
|
|
|
"encoding/json"
|
2021-06-05 19:51:33 +00:00
|
|
|
"fmt"
|
2018-04-07 21:41:16 +00:00
|
|
|
"io"
|
2018-04-07 18:50:07 +00:00
|
|
|
"io/ioutil"
|
2018-02-16 21:13:01 +00:00
|
|
|
"log"
|
|
|
|
"net/url"
|
|
|
|
"strings"
|
2018-02-19 20:02:47 +00:00
|
|
|
"time"
|
2018-02-16 21:13:01 +00:00
|
|
|
|
2018-08-18 17:17:07 +00:00
|
|
|
"golang.org/x/net/html"
|
|
|
|
"golang.org/x/net/html/atom"
|
|
|
|
|
2018-08-05 10:15:59 +00:00
|
|
|
"p83.nl/go/ekster/pkg/jf2"
|
|
|
|
"p83.nl/go/ekster/pkg/jsonfeed"
|
2018-07-28 15:52:59 +00:00
|
|
|
"p83.nl/go/ekster/pkg/microsub"
|
2020-07-27 20:09:02 +00:00
|
|
|
"p83.nl/go/ekster/pkg/rss"
|
2018-07-28 15:52:59 +00:00
|
|
|
|
2018-02-16 21:13:01 +00:00
|
|
|
"willnorris.com/go/microformats"
|
|
|
|
)
|
|
|
|
|
2019-03-20 10:11:17 +00:00
|
|
|
// FeedHeader returns a new microsub.Feed with the information parsed from body.
//
// The parser is selected on contentType: microformats2 for HTML, JSON Feed
// for application/json, and RSS/Atom for the XML content types. For an
// unknown content type the feed is returned empty with a nil error (only a
// log line is emitted). fetcher is used only in the HTML case, to fetch the
// author page when the feed itself has no complete author card.
func FeedHeader(fetcher Fetcher, fetchURL, contentType string, body io.Reader) (microsub.Feed, error) {
	log.Printf("ProcessContent %s\n", fetchURL)
	log.Println("Found " + contentType)

	feed := microsub.Feed{}

	// Base URL for resolving relative references in the microformats parse.
	// A parse error is deliberately ignored; u is then nil.
	u, _ := url.Parse(fetchURL)

	if strings.HasPrefix(contentType, "text/html") {
		data := microformats.Parse(body, u)

		author, ok := jf2.SimplifyMicroformatDataAuthor(data)
		if !ok {
			// The author card could not be fully simplified. If it at least
			// carries an absolute URL, fetch that page and try to read a
			// complete author card from there.
			if strings.HasPrefix(author.URL, "http") {
				resp, err := fetcher.Fetch(author.URL)
				if err != nil {
					return feed, err
				}
				defer resp.Body.Close()
				// NOTE: shadows the outer u on purpose — relative references
				// on the author page resolve against the author URL.
				u, _ := url.Parse(author.URL)

				md := microformats.Parse(resp.Body, u)

				author, ok = jf2.SimplifyMicroformatDataAuthor(md)
				if !ok {
					// Best effort: keep whatever partial author we already have.
					log.Println("Could not simplify the author")
				}
			}
		}

		feed.Type = "feed"
		feed.URL = fetchURL
		feed.Name = author.Name
		feed.Photo = author.Photo
	} else if strings.HasPrefix(contentType, "application/json") { // json feed?
		jfeed, err := jsonfeed.Parse(body)
		if err != nil {
			log.Printf("Error while parsing json feed: %s\n", err)
			return feed, err
		}

		feed.Type = "feed"
		// Prefer the feed title; fall back to the author name.
		feed.Name = jfeed.Title
		if feed.Name == "" {
			feed.Name = jfeed.Author.Name
		}

		// Prefer the canonical feed URL advertised by the feed itself.
		feed.URL = jfeed.FeedURL

		if feed.URL == "" {
			feed.URL = fetchURL
		}
		// Prefer the feed icon; fall back to the author avatar.
		feed.Photo = jfeed.Icon

		if feed.Photo == "" {
			feed.Photo = jfeed.Author.Avatar
		}

		feed.Author.Type = "card"
		feed.Author.Name = jfeed.Author.Name
		feed.Author.URL = jfeed.Author.URL
		feed.Author.Photo = jfeed.Author.Avatar
	} else if strings.HasPrefix(contentType, "text/xml") || strings.HasPrefix(contentType, "application/rss+xml") || strings.HasPrefix(contentType, "application/atom+xml") || strings.HasPrefix(contentType, "application/xml") {
		// rss.Parse needs the whole document in memory.
		body, err := ioutil.ReadAll(body)
		if err != nil {
			log.Printf("Error while parsing rss/atom feed: %s\n", err)
			return feed, err
		}

		xfeed, err := rss.Parse(body)
		if err != nil {
			log.Printf("Error while parsing rss/atom feed: %s\n", err)
			return feed, err
		}

		feed.Type = "feed"
		feed.Name = xfeed.Title
		feed.URL = fetchURL
		feed.Description = xfeed.Description
		feed.Photo = xfeed.Image.URL
	} else {
		// Unknown content type: returns the empty feed with a nil error.
		log.Printf("Unknown Content-Type: %s\n", contentType)
	}

	log.Println("Found feed: ", feed)

	return feed, nil
}
|
|
|
|
|
2019-03-20 10:11:17 +00:00
|
|
|
// FeedItems returns the items from the url, parsed from body.
|
2021-10-30 19:09:27 +00:00
|
|
|
func FeedItems(fetcher Fetcher, fetchURL, contentType string, body io.Reader) ([]microsub.Item, error) {
|
2018-04-07 23:26:51 +00:00
|
|
|
log.Printf("ProcessContent %s\n", fetchURL)
|
|
|
|
log.Println("Found " + contentType)
|
|
|
|
|
2018-04-07 23:11:40 +00:00
|
|
|
items := []microsub.Item{}
|
|
|
|
|
2018-04-07 18:50:07 +00:00
|
|
|
u, _ := url.Parse(fetchURL)
|
|
|
|
|
|
|
|
if strings.HasPrefix(contentType, "text/html") {
|
2018-04-07 21:41:16 +00:00
|
|
|
data := microformats.Parse(body, u)
|
2018-04-07 14:36:27 +00:00
|
|
|
|
2018-12-08 15:56:08 +00:00
|
|
|
results := jf2.SimplifyMicroformatDataItems(data)
|
2018-04-07 14:36:27 +00:00
|
|
|
|
2018-04-07 18:50:07 +00:00
|
|
|
// Filter items with "published" date
|
|
|
|
for _, r := range results {
|
2018-12-08 15:56:08 +00:00
|
|
|
if r.UID != "" {
|
|
|
|
r.ID = hex.EncodeToString([]byte(r.UID))
|
|
|
|
} else if r.URL != "" {
|
|
|
|
r.ID = hex.EncodeToString([]byte(r.URL))
|
2018-04-07 18:50:07 +00:00
|
|
|
} else {
|
2018-04-25 19:52:04 +00:00
|
|
|
continue
|
2018-04-07 18:50:07 +00:00
|
|
|
}
|
|
|
|
|
2018-12-08 15:56:08 +00:00
|
|
|
items = append(items, r)
|
2018-04-07 14:36:27 +00:00
|
|
|
}
|
2021-06-05 19:51:33 +00:00
|
|
|
} else if strings.HasPrefix(contentType, "application/json") && strings.HasPrefix(contentType, "application/feed+json") { // json feed?
|
2018-08-04 20:43:57 +00:00
|
|
|
var feed jsonfeed.Feed
|
2021-06-05 19:51:33 +00:00
|
|
|
err := json.NewDecoder(body).Decode(&feed)
|
2018-04-07 18:50:07 +00:00
|
|
|
if err != nil {
|
2021-06-05 19:51:33 +00:00
|
|
|
return items, fmt.Errorf("could not parse as jsonfeed: %v", err)
|
2018-04-07 18:50:07 +00:00
|
|
|
}
|
2018-04-08 08:43:06 +00:00
|
|
|
|
2018-04-11 16:47:57 +00:00
|
|
|
author := µsub.Card{}
|
2018-04-08 08:43:06 +00:00
|
|
|
author.Type = "card"
|
|
|
|
author.Name = feed.Author.Name
|
|
|
|
author.URL = feed.Author.URL
|
|
|
|
author.Photo = feed.Author.Avatar
|
|
|
|
|
2018-04-08 08:47:46 +00:00
|
|
|
if author.Photo == "" {
|
|
|
|
author.Photo = feed.Icon
|
|
|
|
}
|
|
|
|
|
2018-04-07 18:50:07 +00:00
|
|
|
for _, feedItem := range feed.Items {
|
|
|
|
var item microsub.Item
|
2018-04-08 19:07:51 +00:00
|
|
|
item.Type = "entry"
|
2018-04-07 18:50:07 +00:00
|
|
|
item.Name = feedItem.Title
|
2018-04-11 16:47:57 +00:00
|
|
|
item.Content = µsub.Content{}
|
2018-04-07 18:50:07 +00:00
|
|
|
item.Content.HTML = feedItem.ContentHTML
|
|
|
|
item.Content.Text = feedItem.ContentText
|
|
|
|
item.URL = feedItem.URL
|
2018-04-09 17:20:32 +00:00
|
|
|
item.ID = hex.EncodeToString([]byte(feedItem.ID))
|
2018-04-07 18:50:07 +00:00
|
|
|
item.Published = feedItem.DatePublished
|
2018-04-08 19:07:51 +00:00
|
|
|
|
2018-04-11 16:47:57 +00:00
|
|
|
itemAuthor := µsub.Card{}
|
2018-04-08 19:07:51 +00:00
|
|
|
itemAuthor.Type = "card"
|
|
|
|
itemAuthor.Name = feedItem.Author.Name
|
|
|
|
itemAuthor.URL = feedItem.Author.URL
|
|
|
|
itemAuthor.Photo = feedItem.Author.Avatar
|
|
|
|
if itemAuthor.URL != "" {
|
|
|
|
item.Author = itemAuthor
|
|
|
|
} else {
|
|
|
|
item.Author = author
|
|
|
|
}
|
2018-04-08 18:47:43 +00:00
|
|
|
item.Photo = []string{feedItem.Image}
|
2018-04-07 23:11:40 +00:00
|
|
|
items = append(items, item)
|
2018-04-07 18:50:07 +00:00
|
|
|
}
|
2018-05-22 19:12:00 +00:00
|
|
|
} else if strings.HasPrefix(contentType, "text/xml") || strings.HasPrefix(contentType, "application/rss+xml") || strings.HasPrefix(contentType, "application/atom+xml") || strings.HasPrefix(contentType, "application/xml") {
|
2018-04-07 21:41:16 +00:00
|
|
|
body, err := ioutil.ReadAll(body)
|
2018-04-07 18:50:07 +00:00
|
|
|
if err != nil {
|
2021-06-05 19:51:33 +00:00
|
|
|
return items, fmt.Errorf("could not read feed for rss/atom: %v", err)
|
2018-04-07 18:50:07 +00:00
|
|
|
}
|
|
|
|
feed, err := rss.Parse(body)
|
|
|
|
if err != nil {
|
2021-06-05 19:51:33 +00:00
|
|
|
return items, fmt.Errorf("while parsing rss/atom feed: %v", err)
|
2018-04-07 14:36:27 +00:00
|
|
|
}
|
|
|
|
|
2018-08-18 17:17:07 +00:00
|
|
|
baseURL, _ := url.Parse(fetchURL)
|
|
|
|
|
2018-04-07 18:50:07 +00:00
|
|
|
for _, feedItem := range feed.Items {
|
|
|
|
var item microsub.Item
|
2018-04-11 16:32:46 +00:00
|
|
|
item.Type = "entry"
|
2018-04-07 18:50:07 +00:00
|
|
|
item.Name = feedItem.Title
|
2018-04-11 16:47:57 +00:00
|
|
|
item.Content = µsub.Content{}
|
2018-04-10 19:02:16 +00:00
|
|
|
if len(feedItem.Content) > 0 {
|
2018-08-18 17:17:07 +00:00
|
|
|
item.Content.HTML = expandHref(feedItem.Content, baseURL)
|
2018-08-19 15:00:09 +00:00
|
|
|
}
|
|
|
|
if len(feedItem.Summary) > 0 {
|
2018-08-19 19:35:26 +00:00
|
|
|
if len(item.Content.HTML) == 0 {
|
|
|
|
item.Content.HTML = feedItem.Summary
|
|
|
|
}
|
2018-04-10 19:02:16 +00:00
|
|
|
}
|
2018-04-07 18:50:07 +00:00
|
|
|
item.URL = feedItem.Link
|
2018-04-08 13:30:57 +00:00
|
|
|
if feedItem.ID == "" {
|
2018-04-09 17:20:32 +00:00
|
|
|
item.ID = hex.EncodeToString([]byte(feedItem.Link))
|
2018-04-08 13:30:57 +00:00
|
|
|
} else {
|
2018-04-09 17:20:32 +00:00
|
|
|
item.ID = hex.EncodeToString([]byte(feedItem.ID))
|
2018-04-08 13:30:57 +00:00
|
|
|
}
|
2018-05-22 19:15:40 +00:00
|
|
|
|
|
|
|
itemAuthor := µsub.Card{}
|
|
|
|
itemAuthor.Type = "card"
|
|
|
|
itemAuthor.Name = feed.Title
|
|
|
|
itemAuthor.URL = feed.Link
|
|
|
|
itemAuthor.Photo = feed.Image.URL
|
|
|
|
item.Author = itemAuthor
|
|
|
|
|
2018-04-08 00:04:38 +00:00
|
|
|
item.Published = feedItem.Date.Format(time.RFC3339)
|
2018-04-07 23:11:40 +00:00
|
|
|
items = append(items, item)
|
2018-04-07 18:12:57 +00:00
|
|
|
}
|
2018-04-07 18:50:07 +00:00
|
|
|
} else {
|
2021-06-05 19:51:33 +00:00
|
|
|
return items, fmt.Errorf("unknown content-type %s for url %s", contentType, fetchURL)
|
2018-04-07 18:12:57 +00:00
|
|
|
}
|
2018-04-11 16:18:54 +00:00
|
|
|
|
|
|
|
for i, v := range items {
|
|
|
|
// Clear type of author, when other fields also aren't set
|
2018-04-11 17:49:49 +00:00
|
|
|
if v.Author != nil && v.Author.Name == "" && v.Author.Photo == "" && v.Author.URL == "" {
|
2018-08-29 19:22:03 +00:00
|
|
|
v.Author = nil
|
2018-04-11 16:18:54 +00:00
|
|
|
items[i] = v
|
|
|
|
}
|
|
|
|
}
|
2018-07-05 20:52:19 +00:00
|
|
|
|
2018-04-07 23:11:40 +00:00
|
|
|
return items, nil
|
|
|
|
}
|
2018-08-18 17:17:07 +00:00
|
|
|
|
|
|
|
// expandHref expands relative URLs in a.href and img.src attributes to be absolute URLs.
|
|
|
|
func expandHref(s string, base *url.URL) string {
|
|
|
|
var buf bytes.Buffer
|
|
|
|
|
|
|
|
node, _ := html.Parse(strings.NewReader(s))
|
|
|
|
|
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
|
|
expandHrefRec(c, base)
|
|
|
|
}
|
|
|
|
|
|
|
|
html.Render(&buf, node)
|
|
|
|
|
|
|
|
return buf.String()
|
|
|
|
}
|
|
|
|
|
|
|
|
func getAttrPtr(node *html.Node, name string) *string {
|
|
|
|
if node == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
for i, attr := range node.Attr {
|
|
|
|
if strings.EqualFold(attr.Key, name) {
|
|
|
|
return &node.Attr[i].Val
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func isAtom(node *html.Node, atoms ...atom.Atom) bool {
|
|
|
|
if node == nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
for _, atom := range atoms {
|
|
|
|
if atom == node.DataAtom {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
func expandHrefRec(node *html.Node, base *url.URL) {
|
|
|
|
if isAtom(node, atom.A) {
|
|
|
|
href := getAttrPtr(node, "href")
|
|
|
|
if href != nil {
|
|
|
|
if urlParsed, err := url.Parse(*href); err == nil {
|
|
|
|
urlParsed = base.ResolveReference(urlParsed)
|
|
|
|
*href = urlParsed.String()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if isAtom(node, atom.Img) {
|
|
|
|
href := getAttrPtr(node, "src")
|
|
|
|
if href != nil {
|
|
|
|
if urlParsed, err := url.Parse(*href); err == nil {
|
|
|
|
urlParsed = base.ResolveReference(urlParsed)
|
|
|
|
*href = urlParsed.String()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
|
|
expandHrefRec(c, base)
|
|
|
|
}
|
|
|
|
}
|