--- layout: post title: 利用Go+Github Actions写个定时RSS爬虫 date: 2024-07-27 09:50:01 +0800 category: tech thumb: ARTICLEPICTURES_PATH/1722043714404.jpg tags: [Go, Github Actions, COS, 爬虫] --- 说起这事,还是受一位博友的启发“1900”他的左邻右舍页面很棒,决定模仿一下。我平时也用 Inoreader,但我还是喜欢直接打开博客的感觉,心血来潮,搞。 起初,我打算使用 COS 和 GitHub Actions,但在测试过程中发现 GitHub 的延迟非常高,验证和文件写入速度极慢,频频失败。干脆直接上 GitHub 自产自销。 ## 大致思路 ```plaintext main() │ ├── readFeedsFromGitHub() │ ├── GitHub API 调用 │ │ ├── 读取 rss_feeds.txt 文件 │ │ └── 处理文件报错 │ └── Return │ ├── fetchRSS() │ ├── 遍历 RSS │ │ ├── HTTP GET 请求 │ │ └── 处理请求错误 │ ├── 解析 RSS │ │ ├── 清理 XML 内容中的非法字符 │ │ ├── 提取域名 │ │ └── 格式化并排序 │ └── Return │ └── saveToGitHub() ├── GitHub API 调用 │ ├── 保存到 _data/rss_data.json 供 Jekyll 调用 │ └── 处理错误 └── Return ``` 由于用 Go 搬砖,所有的包、类型和方法均可在 GitHub API 客户端库的第 39 版文档查询 关于 Github API 有一点需要注意,配置好环境变量后,Token 操作仓库需要有一定的权限,务必启用 Read and write permissions 读取和写入权限 ```go go mod init github.com/achuanya/Grab-latest-RSS // Go-GitHub v39 go get github.com/google/go-github/v39/github // RSS 和 Atom feeds 解析库 go get github.com/mmcdole/gofeed // OAuth2 认证和授权 go get golang.org/x/oauth2 ``` ## Go RSS 爬虫 Code ```go package main import ( "bufio" "bytes" "context" "encoding/json" "fmt" "net/http" "net/url" "os" "regexp" "sort" "sync" "time" "github.com/google/go-github/v39/github" "github.com/mmcdole/gofeed" "golang.org/x/oauth2" ) const ( maxRetries = 3 // 最大重试次数 retryInterval = 10 * time.Second // 重试间隔时间 ) type Config struct { GithubToken string // GitHub API 令牌 GithubName string // GitHub 用户名 GithubRepository string // GitHub 仓库名 } // 用于解析 avatar_data.json 文件的结构 type Avatar struct { Name string `json:"name"` // 用户名 Avatar string `json:"avatar"` // 头像 URL } // 爬虫抓取的数据结构 type Article struct { DomainName string `json:"domainName"` // 域名 Name string `json:"name"` // 博客名称 Title string `json:"title"` // 文章标题 Link string `json:"link"` // 文章链接 Date string `json:"date"` // 格式化后的文章发布时间 Avatar string `json:"avatar"` // 头像 URL } // 初始化并返回配置信息 func initConfig() Config { return Config{ 
GithubToken: os.Getenv("TOKEN"), // 从环境变量中获取 GitHub API 令牌 GithubName: "achuanya", // GitHub 用户名 GithubRepository: "lhasa.github.io", // GitHub 仓库名 } } // 清理 XML 内容中的非法字符 func cleanXMLContent(content string) string { re := regexp.MustCompile(`[\x00-\x1F\x7F-\x9F]`) return re.ReplaceAllString(content, "") } // 尝试解析不同格式的时间字符串 func parseTime(timeStr string) (time.Time, error) { formats := []string{ time.RFC3339, time.RFC3339Nano, time.RFC1123Z, time.RFC1123, } for _, format := range formats { if t, err := time.Parse(format, timeStr); err == nil { return t, nil } } return time.Time{}, fmt.Errorf("unable to parse time: %s", timeStr) } // 将时间格式化为 "January 2, 2006" func formatTime(t time.Time) string { return t.Format("January 2, 2006") } // 从 URL 中提取域名,并添加 https:// 前缀 func extractDomain(urlStr string) (string, error) { u, err := url.Parse(urlStr) if err != nil { return "", err } domain := u.Hostname() protocol := "https://" if u.Scheme != "" { protocol = u.Scheme + "://" } fullURL := protocol + domain return fullURL, nil } // 获取当前的北京时间 func getBeijingTime() time.Time { beijingTimeZone := time.FixedZone("CST", 8*3600) return time.Now().In(beijingTimeZone) } // 记录错误信息到 error.log 文件 func logError(config Config, message string) { logMessage(config, message, "error.log") } // 记录信息到指定的文件 func logMessage(config Config, message string, fileName string) { ctx := context.Background() client := github.NewClient(oauth2.NewClient(ctx, oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: config.GithubToken, }))) filePath := "_data/" + fileName fileContent := []byte(message + "\n\n") file, _, resp, err := client.Repositories.GetContents(ctx, config.GithubName, config.GithubRepository, filePath, nil) if err != nil && resp.StatusCode == http.StatusNotFound { _, _, err := client.Repositories.CreateFile(ctx, config.GithubName, config.GithubRepository, filePath, &github.RepositoryContentFileOptions{ Message: github.String("Create " + fileName), Content: fileContent, Branch: 
github.String("master"), }) if err != nil { fmt.Printf("error creating %s in GitHub: %v\n", fileName, err) } return } else if err != nil { fmt.Printf("error checking %s in GitHub: %v\n", fileName, err) return } decodedContent, err := file.GetContent() if err != nil { fmt.Printf("error decoding %s content: %v\n", fileName, err) return } updatedContent := append([]byte(decodedContent), fileContent...) _, _, err = client.Repositories.UpdateFile(ctx, config.GithubName, config.GithubRepository, filePath, &github.RepositoryContentFileOptions{ Message: github.String("Update " + fileName), Content: updatedContent, SHA: github.String(*file.SHA), Branch: github.String("master"), }) if err != nil { fmt.Printf("error updating %s in GitHub: %v\n", fileName, err) } } // 从 GitHub 仓库中获取 JSON 文件内容 func fetchFileFromGitHub(config Config, filePath string) (string, error) { ctx := context.Background() client := github.NewClient(oauth2.NewClient(ctx, oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: config.GithubToken, }))) file, _, resp, err := client.Repositories.GetContents(ctx, config.GithubName, config.GithubRepository, filePath, nil) if err != nil { if resp.StatusCode == http.StatusNotFound { return "", fmt.Errorf("file not found: %s", filePath) } return "", fmt.Errorf("error fetching file %s from GitHub: %v", filePath, err) } content, err := file.GetContent() if err != nil { return "", fmt.Errorf("error decoding file %s content: %v", filePath, err) } return content, nil } // 从 GitHub 仓库中读取头像配置 func loadAvatarsFromGitHub(config Config) (map[string]string, error) { content, err := fetchFileFromGitHub(config, "_data/avatar_data.json") if err != nil { return nil, err } var avatars []Avatar if err := json.Unmarshal([]byte(content), &avatars); err != nil { return nil, err } avatarMap := make(map[string]string) for _, a := range avatars { avatarMap[a.Name] = a.Avatar } return avatarMap, nil } // 从 RSS 列表中抓取最新的文章,并按发布时间排序 func fetchRSS(config Config, feeds []string) ([]Article, 
error) { var articles []Article var mu sync.Mutex // 用于保证并发安全 var wg sync.WaitGroup // 用于等待所有 goroutine 完成 avatars, err := loadAvatarsFromGitHub(config) if err != nil { logError(config, fmt.Sprintf("[%s] [Load avatars error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), err)) return nil, err } fp := gofeed.NewParser() httpClient := &http.Client{ Timeout: 10 * time.Second, } for _, feedURL := range feeds { wg.Add(1) go func(feedURL string) { defer wg.Done() var resp *http.Response var bodyString string var fetchErr error for i := 0; i < maxRetries; i++ { resp, fetchErr = httpClient.Get(feedURL) if fetchErr == nil { bodyBytes := new(bytes.Buffer) bodyBytes.ReadFrom(resp.Body) bodyString = bodyBytes.String() resp.Body.Close() break } logError(config, fmt.Sprintf("[%s] [Get RSS error] %s: Attempt %d/%d: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), feedURL, i+1, maxRetries, fetchErr)) time.Sleep(retryInterval) } if fetchErr != nil { logError(config, fmt.Sprintf("[%s] [Failed to fetch RSS] %s: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), feedURL, fetchErr)) return } cleanBody := cleanXMLContent(bodyString) var feed *gofeed.Feed var parseErr error for i := 0; i < maxRetries; i++ { feed, parseErr = fp.ParseString(cleanBody) if parseErr == nil { break } logError(config, fmt.Sprintf("[%s] [Parse RSS error] %s: Attempt %d/%d: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), feedURL, i+1, maxRetries, parseErr)) time.Sleep(retryInterval) } if parseErr != nil { logError(config, fmt.Sprintf("[%s] [Failed to parse RSS] %s: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), feedURL, parseErr)) return } mainSiteURL := feed.Link domainName, err := extractDomain(mainSiteURL) if err != nil { logError(config, fmt.Sprintf("[%s] [Extract domain error] %s: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), mainSiteURL, err)) domainName = "unknown" } name := feed.Title avatarURL := avatars[name] if avatarURL == "" { avatarURL = 
"https://cos.lhasa.icu/LinksAvatar/default.png" } if len(feed.Items) > 0 { item := feed.Items[0] publishedTime, err := parseTime(item.Published) if err != nil && item.Updated != "" { publishedTime, err = parseTime(item.Updated) } if err != nil { logError(config, fmt.Sprintf("[%s] [Getting article time error] %s: %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), item.Title, err)) publishedTime = time.Now() } originalName := feed.Title // 该长的地方短,该短的地方长 nameMapping := map[string]string{ "obaby@mars": "obaby", "青山小站 | 一个在帝都搬砖的新时代农民工": "青山小站", "Homepage on Miao Yu | 于淼": "于淼", "Homepage on Yihui Xie | 谢益辉": "谢益辉", } validNames := make(map[string]struct{}) for key := range nameMapping { validNames[key] = struct{}{} } _, valid := validNames[originalName] if !valid { for key := range validNames { if key == originalName { logError(config, fmt.Sprintf("[%s] [Name mapping not found] %s", getBeijingTime().Format("Mon Jan 2 15:04:2006"), originalName)) break } } } else { name = nameMapping[originalName] } mu.Lock() articles = append(articles, Article{ DomainName: domainName, Name: name, Title: item.Title, Link: item.Link, Avatar: avatarURL, Date: formatTime(publishedTime), }) mu.Unlock() } }(feedURL) } wg.Wait() sort.Slice(articles, func(i, j int) bool { date1, _ := time.Parse("January 2, 2006", articles[i].Date) date2, _ := time.Parse("January 2, 2006", articles[j].Date) return date1.After(date2) }) return articles, nil } // 将爬虫抓取的数据保存到 GitHub func saveToGitHub(config Config, data []Article) error { ctx := context.Background() client := github.NewClient(oauth2.NewClient(ctx, oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: config.GithubToken, }))) manualArticles := []Article{ { DomainName: "https://foreverblog.cn", Name: "十年之约", Title: "穿梭虫洞-随机访问十年之约友链博客", Link: "https://foreverblog.cn/go.html", Date: "January 01, 2000", Avatar: "https://cos.lhasa.icu/LinksAvatar/foreverblog.cn.png", }, { DomainName: "https://www.travellings.cn", Name: "开往", Title: "开往-友链接力", Link: 
"https://www.travellings.cn/go.html", Date: "January 01, 2000", Avatar: "https://cos.lhasa.icu/LinksAvatar/www.travellings.png", }, } data = append(data, manualArticles...) jsonData, err := json.Marshal(data) if err != nil { return err } filePath := "_data/rss_data.json" file, _, resp, err := client.Repositories.GetContents(ctx, config.GithubName, config.GithubRepository, filePath, nil) if err != nil && resp.StatusCode == http.StatusNotFound { _, _, err := client.Repositories.CreateFile(ctx, config.GithubName, config.GithubRepository, filePath, &github.RepositoryContentFileOptions{ Message: github.String("Create rss_data.json"), Content: jsonData, Branch: github.String("master"), }) if err != nil { return fmt.Errorf("error creating rss_data.json in GitHub: %v", err) } return nil } else if err != nil { return fmt.Errorf("error checking rss_data.json in GitHub: %v", err) } _, _, err = client.Repositories.UpdateFile(ctx, config.GithubName, config.GithubRepository, filePath, &github.RepositoryContentFileOptions{ Message: github.String("Update rss_data.json"), Content: jsonData, SHA: github.String(*file.SHA), Branch: github.String("master"), }) if err != nil { return fmt.Errorf("error updating rss_data.json in GitHub: %v", err) } return nil } // 从 GitHub 仓库中获取 RSS 文件 func readFeedsFromGitHub(config Config) ([]string, error) { ctx := context.Background() client := github.NewClient(oauth2.NewClient(ctx, oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: config.GithubToken, }))) filePath := "_data/rss_feeds.txt" file, _, resp, err := client.Repositories.GetContents(ctx, config.GithubName, config.GithubRepository, filePath, nil) if err != nil && resp.StatusCode == http.StatusNotFound { errMsg := fmt.Sprintf("Error: %s not found in GitHub repository", filePath) logError(config, fmt.Sprintf("[%s] [Read RSS file error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), errMsg)) return nil, fmt.Errorf(errMsg) } else if err != nil { errMsg := fmt.Sprintf("Error fetching %s 
from GitHub: %v", filePath, err) logError(config, fmt.Sprintf("[%s] [Read RSS file error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), errMsg)) return nil, fmt.Errorf(errMsg) } content, err := file.GetContent() if err != nil { errMsg := fmt.Sprintf("Error decoding %s content: %v", filePath, err) logError(config, fmt.Sprintf("[%s] [Read RSS file error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), errMsg)) return nil, fmt.Errorf(errMsg) } var feeds []string scanner := bufio.NewScanner(bytes.NewReader([]byte(content))) for scanner.Scan() { feeds = append(feeds, scanner.Text()) } if err := scanner.Err(); err != nil { errMsg := fmt.Sprintf("Error reading RSS file content: %v", err) logError(config, fmt.Sprintf("[%s] [Read RSS file error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), errMsg)) return nil, fmt.Errorf(errMsg) } return feeds, nil } func main() { config := initConfig() // 从 GitHub 仓库中读取 RSS feeds 列表 rssFeeds, err := readFeedsFromGitHub(config) if err != nil { logError(config, fmt.Sprintf("[%s] [Read RSS feeds error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), err)) fmt.Printf("Error reading RSS feeds from GitHub: %v\n", err) return } // 抓取 RSS feeds articles, err := fetchRSS(config, rssFeeds) if err != nil { logError(config, fmt.Sprintf("[%s] [Fetch RSS error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), err)) fmt.Printf("Error fetching RSS feeds: %v\n", err) return } // 将抓取的数据保存到 GitHub 仓库 err = saveToGitHub(config, articles) if err != nil { logError(config, fmt.Sprintf("[%s] [Save data to GitHub error] %v", getBeijingTime().Format("Mon Jan 2 15:04:2006"), err)) fmt.Printf("Error saving data to GitHub: %v\n", err) return } fmt.Println("Stop writing code and go ride a road bike now!") } ``` ### Go 生成的 json 数据 ```json [ { "domainName": "https://yihui.org", "name": "谢益辉", "title": "Rd2roxygen", "link": "https://yihui.org/rd2roxygen/", "date": "April 14, 2024", "avatar": 
      "https://cos.lhasa.icu/LinksAvatar/yihui.org.png"
  },
  {
    "domainName": "https://www.laruence.com",
    "name": "风雪之隅",
    "title": "PHP8.0的Named Parameter",
    "link": "https://www.laruence.com/2022/05/10/6192.html",
    "date": "May 10, 2022",
    "avatar": "https://cos.lhasa.icu/LinksAvatar/www.laruence.com.png"
  }
]
```

### Go 生成的日志

```log
[Sat Jul 27 08:42:2024] [Parse RSS error] https://lhasa.icu: Failed to detect feed type
[Sat Jul 27 08:41:2024] [Get RSS error] https://lhasa.icu: Get "https://lhasa.icu": net/http: TLS handshake timeout
```

## GitHub Actions 1h/次

{% raw %}
```yml
name: ScheduledRssRawler

on:
  schedule:
    - cron: '0 * * * *'
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Go
        uses: actions/setup-go@v3
        with:
          go-version: '1.22.5'

      - name: Install dependencies
        run: go mod tidy
        working-directory: ./api

      - name: Build
        run: go build -o main
        working-directory: ./api

      - name: Run Go program
        env:
          TOKEN: ${{ secrets.KEY }}
        run: ./main
        working-directory: ./api
```
{% endraw %}

腾讯 COS 的版本也写了一份,但因为 GitHub 有延迟问题就没用上;它同样能跑,逻辑上和上面的 Go 版本没啥区别。

Grab-latest-RSS:https://github.com/achuanya/Grab-latest-RSS

COS Go SDK:https://cloud.tencent.com/document/product/436/31215

[效果页:https://lhasa.icu/links.html](https://lhasa.icu/links.html)