From 51aa63873681216026d518cde4abeca307818a4b Mon Sep 17 00:00:00 2001 From: Dawid Rycerz Date: Mon, 12 Jan 2026 18:21:12 +0100 Subject: Add infinite posts downloads --- src/loaders/pleroma.ts | 155 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 122 insertions(+), 33 deletions(-) (limited to 'src/loaders') diff --git a/src/loaders/pleroma.ts b/src/loaders/pleroma.ts index 73d11da..7e1ccb8 100644 --- a/src/loaders/pleroma.ts +++ b/src/loaders/pleroma.ts @@ -38,6 +38,37 @@ interface PleromaStatus { visibility: string; } +/** + * Parse the Link header to extract the max_id for the next page + * Link header format: ; rel="next", ; rel="prev" + */ +function parseNextPageMaxId(linkHeader: string | null): string | null { + if (!linkHeader) { + return null; + } + + // Split by comma to get individual links + const links = linkHeader.split(","); + + for (const link of links) { + // Check if this is the "next" rel link + if (link.includes('rel="next"')) { + // Extract URL from angle brackets + const urlMatch = link.match(/<([^>]+)>/); + if (urlMatch?.[1]) { + // Parse the URL to extract max_id parameter + try { + const url = new URL(urlMatch[1]); + const maxId = url.searchParams.get("max_id"); + return maxId; + } catch {} + } + } + } + + return null; +} + async function getAccountId( instanceUrl: string, username: string, @@ -86,50 +117,108 @@ async function fetchAccountStatuses( maxPosts: number, logger: any, ): Promise { - let response: Response | undefined; - let lastError: unknown; + const allStatuses: PleromaStatus[] = []; + let maxId: string | null = null; + let pageCount = 0; + const pageLimit = 40; // Mastodon/Pleroma API max per page + const fetchAll = maxPosts === -1; + + // Fetch pages until we have enough posts or no more pages available + while (fetchAll || allStatuses.length < maxPosts) { + pageCount++; + let response: Response | undefined; + let lastError: unknown; + + // Build URL with pagination parameters + // If fetching all, always use pageLimit; otherwise calculate remaining + const requestLimit = fetchAll ? pageLimit : Math.min(pageLimit, maxPosts - allStatuses.length); + const params = new URLSearchParams({ + limit: String(requestLimit), + exclude_replies: "true", + exclude_reblogs: "true", + }); - // Add retry logic for network issues - for (let attempt = 1; attempt <= 3; attempt++) { - try { - logger.info(`Attempt ${attempt} to fetch statuses...`); + if (maxId) { + params.set("max_id", maxId); + } - const statusesUrl = `${instanceUrl}/api/v1/accounts/${accountId}/statuses?limit=${maxPosts}&exclude_replies=true&exclude_reblogs=true`; + const statusesUrl = `${instanceUrl}/api/v1/accounts/${accountId}/statuses?${params.toString()}`; - // Create timeout controller - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), 10000); + // Add retry logic for network issues + for (let attempt = 1; attempt <= 3; attempt++) { + try { + const modeMsg = fetchAll ? " [fetching all posts]" : ` [target: ${maxPosts}]`; + logger.info( + `Attempt ${attempt} to fetch statuses page ${pageCount}${maxId ? ` (max_id: ${maxId})` : ""}${modeMsg}...`, + ); + + // Create timeout controller + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 10000); + + response = await fetch(statusesUrl, { + headers: { + "User-Agent": "Astro Blog (pleroma-loader)", + }, + signal: controller.signal, + }); - response = await fetch(statusesUrl, { - headers: { - "User-Agent": "Astro Blog (pleroma-loader)", - }, - signal: controller.signal, - }); + clearTimeout(timeoutId); - clearTimeout(timeoutId); + if (response.ok) { + break; // Success, exit retry loop + } + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } catch (error) { + lastError = error; + logger.warn(`Attempt ${attempt} failed: ${error}`); - if (response.ok) { - break; // Success, exit retry loop - } - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } catch (error) { - lastError = error; - logger.warn(`Attempt ${attempt} failed: ${error}`); - - if (attempt < 3) { - logger.info("Retrying in 2 seconds..."); - await new Promise((resolve) => setTimeout(resolve, 2000)); + if (attempt < 3) { + logger.info("Retrying in 2 seconds..."); + await new Promise((resolve) => setTimeout(resolve, 2000)); + } } } - } - if (!response || !response.ok) { - throw new Error(`Failed to fetch statuses after 3 attempts. Last error: ${lastError}`); + if (!response || !response.ok) { + throw new Error(`Failed to fetch statuses after 3 attempts. Last error: ${lastError}`); + } + + const statuses: PleromaStatus[] = await response.json(); + logger.info(`Fetched ${statuses.length} statuses from page ${pageCount}`); + + // If no statuses returned, we've reached the end + if (statuses.length === 0) { + logger.info("No more statuses available"); + break; + } + + // Add statuses to our accumulated list + allStatuses.push(...statuses); + + // Parse Link header to get next page max_id + const linkHeader = response.headers.get("link"); + const nextMaxId = parseNextPageMaxId(linkHeader); + + if (!nextMaxId) { + logger.info("No more pages available (no next link in header)"); + break; + } + + // If the max_id hasn't changed, we're stuck in a loop - break + if (nextMaxId === maxId) { + logger.warn("Pagination returned same max_id, stopping to prevent infinite loop"); + break; + } + + maxId = nextMaxId; } - const statuses: PleromaStatus[] = await response.json(); - return statuses; + const summaryMsg = fetchAll + ? `Total fetched: ${allStatuses.length} statuses (all available) across ${pageCount} page(s)` + : `Total fetched: ${allStatuses.length} statuses (target: ${maxPosts}) across ${pageCount} page(s)`; + logger.info(summaryMsg); + return allStatuses; } function isFilteredStatus(status: PleromaStatus): boolean { -- cgit v1.2.3