import type { Loader } from "astro/loaders";
import { marked } from "marked";
import TurndownService from "turndown";

/** Options accepted by `pleromaLoader`. */
interface PleromaFeedConfig {
  instanceUrl: string;
  username: string;
  /** Number of statuses to fetch (default 20); `-1` fetches every available page. */
  maxPosts?: number;
  accountId?: string; // Optional: if provided, skips account lookup
  allowedTags?: string[]; // Optional: if provided, only posts with these tags are included
  mergeThreads?: boolean; // Optional: if true, merges thread posts into single entry (default: true)
}

interface PleromaAccount {
  id: string;
  username: string;
  acct: string;
  display_name: string;
  url: string;
}

interface PleromaMediaAttachment {
  id: string;
  type: "image" | "video" | "gifv" | "audio" | "unknown";
  url: string;
  preview_url: string;
  description?: string;
}

interface PleromaStatus {
  id: string;
  created_at: string;
  content: string;
  url: string;
  reblog: PleromaStatus | null;
  in_reply_to_id: string | null;
  sensitive: boolean;
  media_attachments: PleromaMediaAttachment[];
  visibility: string;
  account: PleromaAccount;
}

/** The subset of the loader logger that this module actually calls. */
interface LoaderLogger {
  info(message: string): void;
  warn(message: string): void;
}

const USER_AGENT = "Astro Blog (pleroma-loader)";
const REQUEST_TIMEOUT_MS = 10000;
const STATUSES_PAGE_LIMIT = 40; // Mastodon/Pleroma API max per page
const MAX_FETCH_ATTEMPTS = 3;
const RETRY_DELAY_MS = 2000;

// Written as escapes so the literals survive a mis-encoded copy of this file.
const THREAD_EMOJIS = [
  "\u{1F9F5}", // spool of thread
  "\u{1F447}", // backhand index pointing down
  "\u2B07\uFE0F", // down arrow
  // NOTE(review): this entry was truncated in the damaged source; U+1F4DD (memo)
  // is the most plausible original. Confirm against version control.
  "\u{1F4DD}",
  "\u{1F4D6}", // open book
  "\u2935\uFE0F", // right arrow curving down
  "\u{1F53D}", // downwards button
];

// Numbered thread patterns for the FIRST post of a thread.
const NUMBERED_THREAD_PATTERNS = [
  /\b1\/([n*]|\d+)\b/i, // 1/n, 1/2
  /\(1\/([n*]|\d+)\)/i, // (1/n), (1/*)
  /\[1\/([n*]|\d+)\]/i, // [1/n], [1/*]
];

// Text markers (case insensitive).
const THREAD_TEXT_MARKERS = [
  /\bthread:/i, // Thread:
  /\[thread\]/i, // [Thread]
  /^thread about/i, // Thread about... (start of text)
  /^a thread about/i, // A thread about...
];

/**
 * GET `url` with the loader's User-Agent, aborting after REQUEST_TIMEOUT_MS.
 * The timer is cleared in `finally`, so a rejected fetch does not leave it pending.
 * Only the wait for the response is covered by the timeout, not reading the body.
 */
async function fetchWithTimeout(url: string): Promise<Response> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
  try {
    return await fetch(url, {
      headers: { "User-Agent": USER_AGENT },
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timeoutId);
  }
}

/** Escape a value for use inside HTML text or a double-quoted attribute. */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Derive an `image/<extension>` type from an attachment URL.
 * Only the URL path is inspected, so a query string or fragment cannot leak
 * into the result; URLs without a file extension fall back to `image/jpeg`.
 */
function imageTypeFromUrl(url: string): string {
  let path: string;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip query/fragment by hand.
    path = url.split(/[?#]/)[0] ?? url;
  }
  const fileName = path.split("/").pop() ?? "";
  const dotIndex = fileName.lastIndexOf(".");
  const extension = dotIndex >= 0 ? fileName.slice(dotIndex + 1) : "";
  return `image/${extension || "jpeg"}`;
}

/**
 * Detect if a post is a thread starter by checking for thread markers:
 * thread-like emoji, numbered markers such as 1/n, (1/n), [1/n], or text
 * markers such as "Thread:".
 *
 * `content` is the status HTML. Tags are stripped first so that attribute
 * values (e.g. link hrefs) do not trigger a match and the start-anchored text
 * markers see the first visible word rather than an opening tag.
 */
function isThreadStarter(content: string): boolean {
  const text = content.replace(/<[^>]*>/g, " ").trim();

  return (
    THREAD_EMOJIS.some((emoji) => text.includes(emoji)) ||
    NUMBERED_THREAD_PATTERNS.some((pattern) => pattern.test(text)) ||
    THREAD_TEXT_MARKERS.some((pattern) => pattern.test(text))
  );
}

/**
 * Parse the Link header to extract the max_id for the next page.
 * Link header format: <url>; rel="next", <url>; rel="prev"
 *
 * @returns the `max_id` query parameter of the rel="next" URL, or null when the
 *          header is missing, has no next link, or the URL has no `max_id`.
 */
function parseNextPageMaxId(linkHeader: string | null): string | null {
  if (!linkHeader) {
    return null;
  }

  for (const link of linkHeader.split(",")) {
    if (!link.includes('rel="next"')) {
      continue;
    }

    // Extract URL from angle brackets
    const target = link.match(/<([^>]+)>/)?.[1];
    if (!target) {
      continue;
    }

    try {
      // The first parseable rel="next" link decides the result.
      return new URL(target).searchParams.get("max_id");
    } catch {
      // Unparseable URL: treat as "no next page information" (best effort).
    }
  }

  return null;
}

/**
 * Fetch the context (ancestors and descendants) for a given status.
 * Returns only the descendants array for thread building; any failure is
 * logged and reported as an empty array.
 */
async function fetchStatusContext(
  instanceUrl: string,
  statusId: string,
  logger: LoaderLogger,
): Promise<PleromaStatus[]> {
  try {
    const contextUrl = `${instanceUrl}/api/v1/statuses/${statusId}/context`;
    logger.info(`Fetching context for status: ${statusId}`);

    const response = await fetchWithTimeout(contextUrl);
    if (!response.ok) {
      logger.warn(`Failed to fetch context: HTTP ${response.status}`);
      return [];
    }

    const context: { ancestors: PleromaStatus[]; descendants: PleromaStatus[] } =
      await response.json();
    // The payload is unvalidated JSON; do not trust its shape.
    const descendants = Array.isArray(context.descendants) ? context.descendants : [];
    logger.info(`Fetched ${descendants.length} descendants for status ${statusId}`);
    return descendants;
  } catch (error) {
    logger.warn(`Failed to fetch status context: ${error}`);
    return [];
  }
}

/**
 * Build a direct author-to-author reply chain from the thread starter.
 * Stops when there is no reply by the same author to the current post.
 * When several same-author replies target one post, the first in
 * `descendants` order is followed.
 */
function buildAuthorChain(starter: PleromaStatus, descendants: PleromaStatus[]): PleromaStatus[] {
  const authorAccountId = starter.account.id;

  // parent id -> first same-author reply, so each step is a map lookup.
  const replyByParent = new Map<string, PleromaStatus>();
  for (const status of descendants) {
    const parentId = status.in_reply_to_id;
    if (parentId && status.account.id === authorAccountId && !replyByParent.has(parentId)) {
      replyByParent.set(parentId, status);
    }
  }

  const chain: PleromaStatus[] = [starter];
  const visited = new Set<string>([starter.id]); // guards against malformed, cyclic data
  let next = replyByParent.get(starter.id);
  while (next && !visited.has(next.id)) {
    chain.push(next);
    visited.add(next.id);
    next = replyByParent.get(next.id);
  }

  return chain;
}

/**
 * Strip thread markers from content (1/n, 2/n, 3/4, etc.) and the thread emoji.
 *
 * A counter is only removed when it stands on its own: it must not be glued to
 * a preceding word character, slash or other URL punctuation, and must not be
 * followed by a word character or slash. This keeps link targets such as
 * `https://example.com/2024/05/post` intact. Free-standing fractions and dates
 * (e.g. "12/25") are still indistinguishable from counters and are removed.
 */
function stripThreadMarkers(content: string): string {
  return content
    // `\\?\*` also accepts the backslash-escaped asterisk that turndown emits.
    .replace(/\s*(?<![\w/.:=?&#-])\d+\/(?:n|\\?\*|\d+)(?![\w/])\s*/gi, " ")
    .replace(/\u{1F9F5}/gu, "")
    .trim();
}

/**
 * Merge thread posts into a single content structure with image grids per segment.
 *
 * Each post becomes one segment (cleaned text followed by an HTML grid of its
 * image attachments); segments are joined with a horizontal rule.
 * `attachments` is always empty: images are already embedded in `content`, and
 * returning them too would produce a duplicate grid at the end.
 */
function mergeThreadContent(chain: PleromaStatus[]): {
  content: string;
  attachments: Array<{ url: string; type: string }>;
} {
  const segments = chain.map((post) => {
    const text = stripThreadMarkers(cleanContent(post.content || ""));

    const images = post.media_attachments.filter((attachment) => attachment.type === "image");
    if (images.length === 0) {
      return text;
    }

    // NOTE(review): the original grid markup was lost to extraction damage.
    // This is a minimal reconstruction; confirm the wrapper element and class
    // name against the site's stylesheet. Values come from a remote server, so
    // they are escaped, and the description is collapsed to one line because a
    // blank line would terminate the HTML block inside the markdown.
    const imageTags = images.map((attachment) => {
      const description = attachment.description?.replace(/\s+/g, " ").trim() || "Image";
      return `<img src="${escapeHtml(attachment.url)}" alt="${escapeHtml(description)}" loading="lazy" />`;
    });
    const imageGrid = `<div class="image-grid">\n${imageTags.join("\n")}\n</div>`;

    return text ? `${text}\n\n${imageGrid}` : imageGrid;
  });

  // Join segments with horizontal rule separator
  return { content: segments.join("\n\n---\n\n"), attachments: [] };
}

/**
 * Look up the account id for `username` via the account search endpoint.
 * Returns null (after logging a warning) when the request fails or finds nothing.
 */
async function getAccountId(
  instanceUrl: string,
  username: string,
  logger: LoaderLogger,
): Promise<string | null> {
  try {
    const searchUrl = `${instanceUrl}/api/v1/accounts/search?q=${encodeURIComponent(username)}&limit=1`;
    logger.info(`Looking up account ID for username: ${username}`);

    const response = await fetchWithTimeout(searchUrl);
    if (!response.ok) {
      logger.warn(`Failed to search for account: HTTP ${response.status}`);
      return null;
    }

    const accounts: PleromaAccount[] = await response.json();
    const account = accounts[0];
    if (!account) {
      logger.warn(`No account found for username: ${username}`);
      return null;
    }

    // Search is fuzzy; make a mismatch visible instead of silently using it.
    const wanted = username.replace(/^@/, "").toLowerCase();
    if (account.acct?.toLowerCase() !== wanted && account.username?.toLowerCase() !== wanted) {
      logger.warn(`Account search for "${username}" returned @${account.acct}; using it anyway`);
    }

    logger.info(`Found account ID: ${account.id} for @${account.acct}`);
    return account.id;
  } catch (error) {
    logger.warn(`Failed to lookup account ID: ${error}`);
    return null;
  }
}

/**
 * Fetch an account's statuses (replies and reblogs excluded), following
 * Link-header pagination until `maxPosts` statuses are collected or no next
 * page exists. `maxPosts === -1` fetches everything.
 *
 * @throws Error when a page still fails after MAX_FETCH_ATTEMPTS attempts.
 */
async function fetchAccountStatuses(
  instanceUrl: string,
  accountId: string,
  maxPosts: number,
  logger: LoaderLogger,
): Promise<PleromaStatus[]> {
  const allStatuses: PleromaStatus[] = [];
  const fetchAll = maxPosts === -1;
  const modeMsg = fetchAll ? " [fetching all posts]" : ` [target: ${maxPosts}]`;
  let maxId: string | null = null;
  let pageCount = 0;

  // Fetch pages until we have enough posts or no more pages available
  while (fetchAll || allStatuses.length < maxPosts) {
    pageCount++;

    // If fetching all, always use the page limit; otherwise request only what is still missing
    const requestLimit = fetchAll
      ? STATUSES_PAGE_LIMIT
      : Math.min(STATUSES_PAGE_LIMIT, maxPosts - allStatuses.length);
    const params = new URLSearchParams({
      limit: String(requestLimit),
      exclude_replies: "true",
      exclude_reblogs: "true",
    });
    if (maxId) {
      params.set("max_id", maxId);
    }
    const statusesUrl = `${instanceUrl}/api/v1/accounts/${accountId}/statuses?${params.toString()}`;

    // Retry logic for network issues
    let response: Response | undefined;
    let lastError: unknown;
    for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
      try {
        logger.info(
          `Attempt ${attempt} to fetch statuses page ${pageCount}${maxId ? ` (max_id: ${maxId})` : ""}${modeMsg}...`,
        );
        response = await fetchWithTimeout(statusesUrl);
        if (response.ok) {
          break; // Success, exit retry loop
        }
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      } catch (error) {
        lastError = error;
        logger.warn(`Attempt ${attempt} failed: ${error}`);
        if (attempt < MAX_FETCH_ATTEMPTS) {
          logger.info("Retrying in 2 seconds...");
          await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
        }
      }
    }

    if (!response || !response.ok) {
      throw new Error(`Failed to fetch statuses after 3 attempts. Last error: ${lastError}`);
    }

    const statuses: PleromaStatus[] = await response.json();
    logger.info(`Fetched ${statuses.length} statuses from page ${pageCount}`);

    // If no statuses returned, we've reached the end
    if (statuses.length === 0) {
      logger.info("No more statuses available");
      break;
    }

    allStatuses.push(...statuses);

    // Parse Link header to get next page max_id
    const nextMaxId = parseNextPageMaxId(response.headers.get("link"));
    if (!nextMaxId) {
      logger.info("No more pages available (no next link in header)");
      break;
    }

    // If the max_id hasn't changed, we're stuck in a loop - break
    if (nextMaxId === maxId) {
      logger.warn("Pagination returned same max_id, stopping to prevent infinite loop");
      break;
    }

    maxId = nextMaxId;
  }

  // A server that ignores `limit` must not push us past the requested maximum.
  const result = fetchAll ? allStatuses : allStatuses.slice(0, maxPosts);

  const summaryMsg = fetchAll
    ? `Total fetched: ${result.length} statuses (all available) across ${pageCount} page(s)`
    : `Total fetched: ${result.length} statuses (target: ${maxPosts}) across ${pageCount} page(s)`;
  logger.info(summaryMsg);

  return result;
}

/**
 * True when a status must be dropped: boosts/reblogs and replies (already
 * excluded by the API parameters, double-checked here) and sensitive posts.
 */
function isFilteredStatus(status: PleromaStatus): boolean {
  return Boolean(status.reblog) || Boolean(status.in_reply_to_id) || status.sensitive;
}

/**
 * Extract the distinct hashtags (lower-cased, including the leading `#`) from
 * status HTML.
 *
 * Works on the visible text: `<span>` tags are unwrapped first because some
 * servers render a hashtag as `#<span>tag</span>`, then the remaining tags are
 * replaced by spaces. A `#` directly after a word character or `&` is ignored,
 * so numeric entities such as `&#39;` are not reported as hashtags.
 */
function extractHashtags(htmlContent: string): string[] {
  const text = htmlContent.replace(/<\/?span\b[^>]*>/gi, "").replace(/<[^>]*>/g, " ");
  const matches = text.match(/(?<![\w&])#\w+/g);
  return matches ? [...new Set(matches.map((tag) => tag.toLowerCase()))] : [];
}

/**
 * True when the status carries at least one of `allowedTags` (compared
 * case-insensitively, with or without a leading `#`). An empty or missing list
 * disables filtering.
 */
function hasAllowedTag(status: PleromaStatus, allowedTags: string[]): boolean {
  if (!allowedTags || allowedTags.length === 0) {
    return true; // No filtering if no tags specified
  }

  const normalize = (tag: string): string => tag.toLowerCase().replace(/^#/, "");
  const allowed = new Set(allowedTags.map(normalize));

  return extractHashtags(status.content || "").some((tag) => allowed.has(normalize(tag)));
}

/**
 * Convert status HTML to markdown.
 *
 * NOTE(review): the original per-class `<span …>` patterns (mention, hashtag,
 * ellipsis, invisible) were lost to extraction damage. In the surviving code
 * every `</span>` was removed before the ellipsis/invisible patterns ran, and
 * those patterns required a closing tag, so they appear to have been no-ops.
 * The observable effect - spans are unwrapped and their text kept - is what
 * this implements. Confirm against version control whether hiding
 * "invisible" link text was actually intended.
 */
function cleanContent(htmlContent: string): string {
  const turndownService = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
  });

  // Unwrap span wrappers but keep their content
  const cleanedContent = htmlContent.replace(/<\/?span\b[^>]*>/gi, "");

  // Convert to markdown
  const markdown = turndownService.turndown(cleanedContent);

  // Collapse runs of blank lines into a single blank line
  return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
}

/**
 * Render markdown to HTML with `marked`.
 * Returns an empty string if `marked.parse` yields a Promise instead of a string.
 */
function markdownToHtml(markdown: string): string {
  marked.setOptions({
    breaks: true, // Convert line breaks to <br>
    gfm: true, // GitHub flavored markdown
  });

  const html = marked.parse(markdown);

  // marked.parse can return string or Promise
  return typeof html === "string" ? html : "";
}

/**
 * Derive a title from the first sentence of the first line: at most 60
 * characters (57 plus "..."), with markdown formatting characters removed.
 * Falls back to "Micro post" when nothing usable remains.
 */
function extractTitle(content: string): string {
  const firstLine = content.split("\n")[0];
  if (!firstLine) return "Micro post";

  const firstSentence = firstLine.split(/[.!?]/)[0];
  if (!firstSentence) return "Micro post";

  const truncated =
    firstSentence.length > 60 ? `${firstSentence.substring(0, 57)}...` : firstSentence;
  const title = truncated.replace(/[#*_`]/g, "").trim(); // Remove markdown formatting

  return title || "Micro post";
}

/**
 * Astro content loader that imports an account's public posts from a
 * Pleroma/Mastodon-compatible instance.
 *
 * Reblogs, replies and sensitive posts are skipped; `allowedTags` optionally
 * restricts posts by hashtag; thread starters are merged with the author's own
 * reply chain unless `mergeThreads` is `false`. Any failure is logged and
 * leaves the store empty rather than failing the build.
 */
export function pleromaLoader(config: PleromaFeedConfig): Loader {
  return {
    name: "pleroma-loader",
    load: async ({ store, logger }) => {
      try {
        const { username, maxPosts = 20, accountId: configAccountId, allowedTags } = config;
        // Tolerate a trailing slash so request URLs do not contain "//api".
        const instanceUrl = config.instanceUrl.replace(/\/+$/, "");

        logger.info(`Fetching Pleroma posts via API for user: ${username}`);

        // Get account ID (use provided one or lookup by username)
        const accountId = configAccountId || (await getAccountId(instanceUrl, username, logger));
        if (!accountId) {
          logger.warn("Failed to get account ID. Continuing without Pleroma posts...");
          store.clear();
          return;
        }

        const statuses = await fetchAccountStatuses(instanceUrl, accountId, maxPosts, logger);
        logger.info(`Fetched ${statuses.length} statuses from API`);

        const validStatuses = statuses.filter(
          (status) =>
            !isFilteredStatus(status) && (!allowedTags || hasAllowedTag(status, allowedTags)),
        );
        logger.info(`After filtering: ${validStatuses.length} valid posts`);

        // Clear existing entries
        store.clear();

        let processedCount = 0;
        for (const status of validStatuses) {
          try {
            const content = status.content || "";
            let cleanedContent: string;
            let attachments: Array<{ url: string; type: string }>;

            if (config.mergeThreads !== false && isThreadStarter(content)) {
              logger.info(`Detected thread starter: ${status.id}`);

              // Fetch context and build the author chain
              const descendants = await fetchStatusContext(instanceUrl, status.id, logger);
              const chain = buildAuthorChain(status, descendants);
              logger.info(`Built chain with ${chain.length} post(s) for thread ${status.id}`);

              const merged = mergeThreadContent(chain);
              cleanedContent = merged.content;
              attachments = merged.attachments;
            } else {
              // Process as single post; keep image attachments only
              cleanedContent = cleanContent(content);
              attachments = status.media_attachments
                .filter((attachment) => attachment.type === "image")
                .map((attachment) => ({
                  url: attachment.url,
                  type: imageTypeFromUrl(attachment.url),
                }));
            }

            const title = extractTitle(cleanedContent);

            store.set({
              id: `pleroma-${status.id}`,
              data: {
                title,
                description:
                  cleanedContent.substring(0, 160) + (cleanedContent.length > 160 ? "..." : ""),
                publishDate: new Date(status.created_at),
                sourceUrl: status.url,
                attachments,
              },
              body: cleanedContent,
              rendered: {
                html: markdownToHtml(cleanedContent),
              },
            });
            processedCount++;

            logger.info(`Processed post: ${title.substring(0, 50)}...`);
          } catch (error) {
            logger.warn(`Failed to process status ${status.id}: ${error}`);
          }
        }

        // Report what was actually stored, not what was attempted.
        logger.info(`Successfully loaded ${processedCount} Pleroma posts`);
      } catch (error) {
        logger.warn(`Pleroma loader failed: ${error}`);
        logger.info("Continuing build without Pleroma posts...");
        // Don't throw error to prevent build failure
        store.clear();
      }
    },
  };
}