import type { Loader } from "astro/loaders";
import { marked } from "marked";
import TurndownService from "turndown";

/** Options accepted by {@link pleromaLoader}. */
interface PleromaFeedConfig {
  /** Base URL of the instance (a trailing slash is tolerated). */
  instanceUrl: string;
  /** Account name used for the lookup when `accountId` is not supplied. */
  username: string;
  /** Number of statuses to request (defaults to 20); `-1` walks every page. */
  maxPosts?: number;
  /** Optional: if provided, skips account lookup. */
  accountId?: string;
  /** Optional: if provided, only posts with at least one of these tags are included. */
  allowedTags?: string[];
}

/** Fields of an account object that this module reads from the search endpoint. */
interface PleromaAccount {
  id: string;
  username: string;
  acct: string;
  display_name: string;
  url: string;
}

/** Media attachment as it appears inside a status. */
interface PleromaMediaAttachment {
  id: string;
  type: "image" | "video" | "gifv" | "audio" | "unknown";
  url: string;
  preview_url: string;
  description?: string;
}

/** Fields of a status object that this module reads. */
interface PleromaStatus {
  id: string;
  created_at: string;
  content: string;
  url: string;
  reblog: PleromaStatus | null;
  in_reply_to_id: string | null;
  sensitive: boolean;
  media_attachments: PleromaMediaAttachment[];
  visibility: string;
}

/** The subset of the logger handed to `load` that this module actually calls. */
interface LoaderLogger {
  info(message: string): void;
  warn(message: string): void;
}

const USER_AGENT = "Astro Blog (pleroma-loader)";
const REQUEST_TIMEOUT_MS = 10000;
const MAX_FETCH_ATTEMPTS = 3;
const RETRY_DELAY_MS = 2000;
/** Page size requested from the statuses endpoint. */
const PAGE_LIMIT = 40;
/** A few candidates are requested so an exact username match can be preferred. */
const ACCOUNT_SEARCH_LIMIT = 5;

/** Extension -> MIME type for the cases where `image/<extension>` would be wrong. */
const IMAGE_MIME_BY_EXTENSION: Record<string, string> = {
  jpg: "image/jpeg",
  jpe: "image/jpeg",
  svg: "image/svg+xml",
  tif: "image/tiff",
  ico: "image/x-icon",
};

/** Drops trailing slashes so `${base}/api/...` never contains `//`. */
function normalizeInstanceUrl(instanceUrl: string): string {
  return instanceUrl.replace(/\/+$/, "");
}

/**
 * GET `url` with the loader's User-Agent, aborting after `timeoutMs`.
 *
 * The timer is cleared in `finally`, so it is released when the request
 * rejects as well as when it resolves. The timeout covers the wait for the
 * response headers only; reading the body is not bounded by it.
 */
async function fetchWithTimeout(
  url: string,
  timeoutMs: number = REQUEST_TIMEOUT_MS,
): Promise<Response> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await fetch(url, {
      headers: { "User-Agent": USER_AGENT },
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Parse the Link header to extract the max_id for the next page.
 * Link header format: `<url>; rel="next", <url>; rel="prev"`
 *
 * @returns the `max_id` query parameter of the first parseable `rel="next"`
 *   target, or `null` when the header is absent, has no next link, or the
 *   next link carries no `max_id`.
 */
function parseNextPageMaxId(linkHeader: string | null): string | null {
  if (!linkHeader) {
    return null;
  }

  // Walk `<target>params` pairs rather than splitting on commas, because a
  // target URL may itself contain a comma.
  for (const match of linkHeader.matchAll(/<([^>]*)>([^<]*)/g)) {
    const target = match[1];
    const params = match[2] ?? "";
    if (!target || !/\brel\s*=\s*"?[^";,]*\bnext\b/i.test(params)) {
      continue;
    }

    try {
      // The base is only consulted when the target is a relative reference.
      const nextUrl = new URL(target, "http://relative.invalid");
      return nextUrl.searchParams.get("max_id");
    } catch {
      // Malformed target: keep scanning in case another next link is usable.
      continue;
    }
  }

  return null;
}

/**
 * Resolve an account ID through the account search endpoint.
 *
 * An exact, case-insensitive match on `acct` or `username` is preferred; when
 * none of the returned candidates matches, the first result is used and a
 * warning is logged. Never throws: every failure is logged and mapped to `null`.
 */
async function getAccountId(
  instanceUrl: string,
  username: string,
  logger: LoaderLogger,
): Promise<string | null> {
  try {
    const searchUrl = `${normalizeInstanceUrl(instanceUrl)}/api/v1/accounts/search?q=${encodeURIComponent(username)}&limit=${ACCOUNT_SEARCH_LIMIT}`;

    logger.info(`Looking up account ID for username: ${username}`);

    const response = await fetchWithTimeout(searchUrl);
    if (!response.ok) {
      logger.warn(`Failed to search for account: HTTP ${response.status}`);
      return null;
    }

    const payload: unknown = await response.json();
    if (!Array.isArray(payload) || payload.length === 0) {
      logger.warn(`No account found for username: ${username}`);
      return null;
    }
    const accounts: PleromaAccount[] = payload;

    const wanted = username.replace(/^@/, "").toLowerCase();
    const exactMatch =
      accounts.find((candidate) => candidate?.acct?.toLowerCase() === wanted) ??
      accounts.find((candidate) => candidate?.username?.toLowerCase() === wanted);

    const account = exactMatch ?? accounts[0];
    if (!account) {
      logger.warn(`No account found for username: ${username}`);
      return null;
    }
    if (!exactMatch) {
      logger.warn(
        `No exact match for username: ${username}; using first search result @${account.acct}`,
      );
    }

    logger.info(`Found account ID: ${account.id} for @${account.acct}`);
    return account.id;
  } catch (error) {
    logger.warn(`Failed to lookup account ID: ${error}`);
    return null;
  }
}

/**
 * Fetch one page of statuses, retrying on network errors and non-OK responses.
 *
 * @param attemptLabel text appended to the per-attempt log line
 * @throws Error once every attempt has failed
 */
async function fetchStatusesPage(
  statusesUrl: string,
  attemptLabel: string,
  logger: LoaderLogger,
): Promise<Response> {
  let lastError: unknown;

  for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
    try {
      logger.info(`Attempt ${attempt} to fetch statuses ${attemptLabel}...`);

      const response = await fetchWithTimeout(statusesUrl);
      if (response.ok) {
        return response;
      }
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    } catch (error) {
      lastError = error;
      logger.warn(`Attempt ${attempt} failed: ${error}`);

      if (attempt < MAX_FETCH_ATTEMPTS) {
        logger.info(`Retrying in ${RETRY_DELAY_MS / 1000} seconds...`);
        await new Promise<void>((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
      }
    }
  }

  throw new Error(
    `Failed to fetch statuses after ${MAX_FETCH_ATTEMPTS} attempts. Last error: ${lastError}`,
  );
}

/**
 * Collect an account's statuses, following Link-header pagination.
 *
 * Replies and reblogs are excluded by the request parameters.
 *
 * @param maxPosts number of statuses to request in total; `-1` keeps paging
 *   until the server reports no further page
 * @throws Error when a page cannot be fetched or is not a JSON array
 */
async function fetchAccountStatuses(
  instanceUrl: string,
  accountId: string,
  maxPosts: number,
  logger: LoaderLogger,
): Promise<PleromaStatus[]> {
  const baseUrl = normalizeInstanceUrl(instanceUrl);
  const fetchAll = maxPosts === -1;
  const modeMsg = fetchAll ? " [fetching all posts]" : ` [target: ${maxPosts}]`;

  const allStatuses: PleromaStatus[] = [];
  let maxId: string | null = null;
  let pageCount = 0;

  // Fetch pages until we have enough posts or no more pages are available.
  while (fetchAll || allStatuses.length < maxPosts) {
    pageCount++;

    // When a target is set, never ask for more than is still missing.
    const requestLimit = fetchAll
      ? PAGE_LIMIT
      : Math.min(PAGE_LIMIT, maxPosts - allStatuses.length);

    const params = new URLSearchParams({
      limit: String(requestLimit),
      exclude_replies: "true",
      exclude_reblogs: "true",
    });
    if (maxId) {
      params.set("max_id", maxId);
    }

    const statusesUrl = `${baseUrl}/api/v1/accounts/${encodeURIComponent(accountId)}/statuses?${params.toString()}`;
    const attemptLabel = `page ${pageCount}${maxId ? ` (max_id: ${maxId})` : ""}${modeMsg}`;

    const response = await fetchStatusesPage(statusesUrl, attemptLabel, logger);

    const payload: unknown = await response.json();
    if (!Array.isArray(payload)) {
      throw new Error(`Unexpected statuses response on page ${pageCount}: expected an array`);
    }
    const statuses: PleromaStatus[] = payload;

    logger.info(`Fetched ${statuses.length} statuses from page ${pageCount}`);

    // An empty page means we have reached the end.
    if (statuses.length === 0) {
      logger.info("No more statuses available");
      break;
    }

    allStatuses.push(...statuses);

    const nextMaxId = parseNextPageMaxId(response.headers.get("link"));
    if (!nextMaxId) {
      logger.info("No more pages available (no next link in header)");
      break;
    }

    // An unchanged cursor would request the same page forever.
    if (nextMaxId === maxId) {
      logger.warn("Pagination returned same max_id, stopping to prevent infinite loop");
      break;
    }

    maxId = nextMaxId;
  }

  const summaryMsg = fetchAll
    ? `Total fetched: ${allStatuses.length} statuses (all available) across ${pageCount} page(s)`
    : `Total fetched: ${allStatuses.length} statuses (target: ${maxPosts}) across ${pageCount} page(s)`;
  logger.info(summaryMsg);

  return allStatuses;
}

/**
 * Returns `true` for statuses that must be skipped: boosts, replies (both are
 * already excluded by the request parameters; this is a second check) and
 * posts flagged sensitive.
 */
function isFilteredStatus(status: PleromaStatus): boolean {
  return Boolean(status.reblog) || Boolean(status.in_reply_to_id) || Boolean(status.sensitive);
}

/**
 * Collect the distinct, lower-cased `#tags` found in a status body.
 *
 * Markup is removed before matching so that `#fragment` parts of `href`
 * attributes and numeric character references such as `&#39;` are not
 * mistaken for hashtags, and so that a tag split by inline markup
 * (`#<span>tag</span>`) is still recognised.
 */
function extractHashtags(htmlContent: string): string[] {
  const text = htmlContent
    .replace(/<[^>]*>/g, "") // tags are dropped without a separator so "#" stays next to its word
    .replace(/&#?\w+;/g, " "); // character references are not hashtags

  // Unicode-aware so tags written in non-ASCII scripts are matched in full.
  const matches = text.match(/#[\p{L}\p{N}_]+/gu);
  return matches ? [...new Set(matches.map((tag) => tag.toLowerCase()))] : [];
}

/**
 * Returns `true` when the status carries at least one of `allowedTags`
 * (compared case-insensitively, with or without a leading `#`). An empty or
 * missing list disables the filter.
 */
function hasAllowedTag(status: PleromaStatus, allowedTags: string[]): boolean {
  if (!allowedTags || allowedTags.length === 0) {
    return true; // No filtering if no tags specified
  }

  const normalize = (tag: string): string => tag.toLowerCase().replace(/^#/, "");
  const allowed = new Set(allowedTags.map(normalize));

  return extractHashtags(status.content || "").some((tag) => allowed.has(normalize(tag)));
}

let turndownService: TurndownService | undefined;

/** Lazily creates the HTML -> Markdown converter once and reuses it for every post. */
function getTurndownService(): TurndownService {
  if (!turndownService) {
    turndownService = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
    });
  }
  return turndownService;
}

/**
 * Convert a status' HTML body to Markdown.
 *
 * Every `<span>` wrapper (mentions, hashtags, link-shortening helpers) is
 * unwrapped and its text kept before conversion.
 */
function cleanContent(htmlContent: string): string {
  // NOTE(review): the per-class span patterns of the earlier revision were not
  // recoverable, and its two "remove ellipsis / invisible text" replacements
  // ran after every closing </span> had already been stripped, so they never
  // matched. Unwrapping all spans is believed to reproduce the effective
  // result. Actually deleting those spans would empty the text of shortened
  // links; confirm the desired behaviour before changing this.
  const unwrapped = htmlContent.replace(/<\/?span\b[^>]*>/gi, "");

  const markdown = getTurndownService().turndown(unwrapped);

  // Collapse runs of blank lines into a single blank line.
  return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
}

/**
 * Render Markdown to HTML with `marked`.
 *
 * NOTE(review): `marked` does not sanitize its output; these options only
 * affect line-break and GFM handling.
 *
 * @returns the HTML string, or `""` if `marked.parse` returns a Promise
 */
function markdownToHtml(markdown: string): string {
  marked.setOptions({
    breaks: true, // Convert line breaks to <br>
    gfm: true, // GitHub flavored markdown
  });

  const html = marked.parse(markdown);

  // marked.parse can return string or Promise; only the sync result is usable here.
  return typeof html === "string" ? html : "";
}

/**
 * Derive a short title from Markdown content: the first sentence of the first
 * line, capped at 60 characters, with `#`, `*`, `_` and backticks removed.
 * Falls back to "Micro post" when nothing usable remains.
 */
function extractTitle(content: string): string {
  const fallback = "Micro post";

  const firstLine = content.split("\n")[0];
  if (!firstLine) return fallback;

  const firstSentence = firstLine.split(/[.!?]/)[0];
  if (!firstSentence) return fallback;

  const clipped =
    firstSentence.length > 60 ? `${firstSentence.substring(0, 57)}...` : firstSentence;

  const title = clipped.replace(/[#*_`]/g, "").trim();
  return title || fallback;
}

/**
 * Guess an image MIME type from the file extension of `url`.
 *
 * Only the URL path is inspected, so query strings and fragments do not leak
 * into the result. Falls back to `image/jpeg` when the path has no extension.
 */
function guessImageMimeType(url: string): string {
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip query/fragment by hand.
    path = url.split(/[?#]/)[0] ?? url;
  }

  const extension = /\.([a-z0-9]+)$/i.exec(path)?.[1]?.toLowerCase();
  if (!extension) {
    return "image/jpeg";
  }
  return IMAGE_MIME_BY_EXTENSION[extension] ?? `image/${extension}`;
}

/**
 * Astro content loader that imports an account's public posts as entries.
 *
 * Each stored entry has the id `pleroma-<status id>`, the Markdown conversion
 * of the post as `body`, and pre-rendered HTML. Boosts, replies and sensitive
 * posts are skipped; `allowedTags` narrows the set further.
 *
 * The loader never throws: on any failure it logs a warning and clears the
 * store, so a build proceeds without these posts.
 */
export function pleromaLoader(config: PleromaFeedConfig): Loader {
  return {
    name: "pleroma-loader",
    load: async ({ store, logger }) => {
      try {
        const { instanceUrl, username, maxPosts = 20, accountId: configAccountId } = config;

        logger.info(`Fetching Pleroma posts via API for user: ${username}`);

        // Use the configured account ID, or look it up by username.
        let accountId: string | undefined = configAccountId;
        if (!accountId) {
          const lookedUpAccountId = await getAccountId(instanceUrl, username, logger);
          if (!lookedUpAccountId) {
            logger.warn("Failed to get account ID. Continuing without Pleroma posts...");
            store.clear();
            return;
          }
          accountId = lookedUpAccountId;
        }

        const statuses = await fetchAccountStatuses(instanceUrl, accountId, maxPosts, logger);
        logger.info(`Fetched ${statuses.length} statuses from API`);

        const validStatuses = statuses.filter((status) => {
          if (isFilteredStatus(status)) return false;
          if (config.allowedTags && !hasAllowedTag(status, config.allowedTags)) return false;
          return true;
        });
        logger.info(`After filtering: ${validStatuses.length} valid posts`);

        // Existing entries are only dropped once the fetch has succeeded.
        store.clear();

        let loadedCount = 0;
        for (const status of validStatuses) {
          try {
            const cleanedContent = cleanContent(status.content || "");
            const title = extractTitle(cleanedContent);

            // Only image attachments are carried over.
            const attachments = (status.media_attachments ?? [])
              .filter((attachment) => attachment.type === "image")
              .map((attachment) => ({
                url: attachment.url,
                type: guessImageMimeType(attachment.url),
              }));

            store.set({
              id: `pleroma-${status.id}`,
              data: {
                title,
                description:
                  cleanedContent.substring(0, 160) + (cleanedContent.length > 160 ? "..." : ""),
                publishDate: new Date(status.created_at),
                sourceUrl: status.url,
                attachments,
              },
              body: cleanedContent,
              rendered: {
                html: markdownToHtml(cleanedContent),
              },
            });
            loadedCount++;

            logger.info(`Processed post: ${title.substring(0, 50)}...`);
          } catch (error) {
            // One bad post must not abort the remaining ones.
            logger.warn(`Failed to process status ${status.id}: ${error}`);
          }
        }

        logger.info(`Successfully loaded ${loadedCount} Pleroma posts`);
      } catch (error) {
        logger.warn(`Pleroma loader failed: ${error}`);
        logger.info("Continuing build without Pleroma posts...");
        // Don't throw error to prevent build failure
        store.clear();
      }
    },
  };
}