import type { Loader } from "astro/loaders";
import { marked } from "marked";
import TurndownService from "turndown";

/** Minimal logging surface used by the helpers in this module. */
interface Logger {
  info: (message: string) => void;
  warn: (message: string) => void;
}

/** Configuration accepted by {@link pleromaLoader}. */
interface PleromaFeedConfig {
  instanceUrl: string;
  username: string;
  /** Maximum number of statuses to fetch (default: 20). `-1` fetches every available page. */
  maxPosts?: number;
  /** Optional: if provided, skips account lookup. */
  accountId?: string;
  /** Optional: if provided, only posts with at least one of these tags are included. */
  allowedTags?: string[];
  /** Optional: if true, merges thread posts into a single entry (default: true). */
  mergeThreads?: boolean;
}

interface PleromaAccount {
  id: string;
  username: string;
  acct: string;
  display_name: string;
  url: string;
}

interface PleromaMediaAttachment {
  id: string;
  type: "image" | "video" | "gifv" | "audio" | "unknown";
  url: string;
  preview_url: string;
  description?: string;
}

interface PleromaStatus {
  id: string;
  created_at: string;
  content: string;
  url: string;
  reblog: PleromaStatus | null;
  in_reply_to_id: string | null;
  sensitive: boolean;
  media_attachments: PleromaMediaAttachment[];
  visibility: string;
  account: PleromaAccount;
  language?: string | null;
}

/** User-Agent header sent with every API request. */
const USER_AGENT = "Astro Blog (pleroma-loader)";
/** Per-request timeout before the fetch is aborted. */
const REQUEST_TIMEOUT_MS = 10000;
/** Number of attempts made for each statuses page. */
const MAX_FETCH_ATTEMPTS = 3;
/** Pause between failed attempts. */
const RETRY_DELAY_MS = 2000;
/** Page size requested from the statuses endpoint (Mastodon/Pleroma API max per page). */
const STATUSES_PAGE_LIMIT = 40;

// NOTE(review): the fourth emoji could not be recovered from the damaged source
// (its last byte was lost); the memo emoji is assumed here - confirm against history.
const THREAD_EMOJIS = ["🧵", "👇", "⬇️", "📝", "📖", "⤵️", "🔽"];

// Numbered thread patterns:
// - 1/n, 1/*, 1/2, 1/10 (plain)
// - (1/n), (1/*), (1/2) (parentheses)
// - [1/n], [1/*], [1/2] (brackets)
const NUMBERED_THREAD_PATTERNS = [
  /\b1\/([n*]|\d+)\b/i, // 1/n, 1/*, 1/2
  /\(1\/([n*]|\d+)\)/i, // (1/n), (1/*)
  /\[1\/([n*]|\d+)\]/i, // [1/n], [1/*]
];

// Text markers (case insensitive)
const THREAD_TEXT_MARKERS = [
  /\bthread:/i, // Thread:
  /\[thread\]/i, // [Thread]
  /^thread about/i, // Thread about... (start of text)
  /^a thread about/i, // A thread about...
];

// A thread counter that stands alone as a token: optional opening bracket, "k/total",
// optional closing bracket. Brackets and "*" may be backslash-escaped.
// NOTE(review): the escaped forms assume Turndown's default Markdown escaping of "[", "]" and "*".
const THREAD_MARKER_PATTERN = /(^|\s)(?:\\?[(\[])?\d+\/(?:\d+|n|\\?\*)(?:\\?[)\]])?(?=\s|$)[ \t]*/gi;

// Hashtag patterns shared by the trailing-hashtag extraction.
// NOTE(review): \w is ASCII-only, so a tag containing non-ASCII letters is not recognised
// in full. Left unchanged because the tag strings feed /tags/ routes not visible here.
const HASHTAG_ONLY_LINE = /^\s*((\[#\w+\]\([^)]+\)|#\w+)\s*)+\s*$/;
const TRAILING_HASHTAGS = /((?:\s*(?:\[#\w+\]\([^)]+\)|#\w+))+)\s*$/;
const HASHTAG_NAME_PATTERN = /\[#(\w+)\]\([^)]+\)|#(\w+)/g;

/** Shared HTML -> Markdown converter; it is configured once and never mutated afterwards. */
const turndownService = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

/**
 * GET a URL with the loader's User-Agent, aborting after `timeoutMs`.
 * The timer is cleared in `finally`, so it is released even when the request rejects.
 */
async function fetchWithTimeout(url: string, timeoutMs: number = REQUEST_TIMEOUT_MS): Promise<Response> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await fetch(url, {
      headers: {
        "User-Agent": USER_AGENT,
      },
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Detect if a post is a thread starter by checking for thread markers
 * Matches patterns like: 🧵, 👇, ⬇️, 1/n, (1/n), [1/n], Thread:, etc.
 *
 * @param content - Status content exactly as received (the loader passes the raw HTML).
 * @returns true when any emoji, numbered or text marker is present.
 */
function isThreadStarter(content: string): boolean {
  return (
    THREAD_EMOJIS.some((emoji) => content.includes(emoji)) ||
    NUMBERED_THREAD_PATTERNS.some((pattern) => pattern.test(content)) ||
    THREAD_TEXT_MARKERS.some((pattern) => pattern.test(content))
  );
}

/**
 * Parse the Link header to extract the max_id for the next page
 * Link header format: <url>; rel="next", <url>; rel="prev"
 *
 * @returns The `max_id` query parameter of the first parseable rel="next" URL, or null.
 */
function parseNextPageMaxId(linkHeader: string | null): string | null {
  if (!linkHeader) {
    return null;
  }

  for (const link of linkHeader.split(",")) {
    if (!link.includes('rel="next"')) {
      continue;
    }

    // Extract URL from angle brackets
    const target = link.match(/<([^>]+)>/)?.[1];
    if (!target) {
      continue;
    }

    try {
      return new URL(target).searchParams.get("max_id");
    } catch {
      // Unparseable URL in this entry: fall through and inspect the remaining entries.
    }
  }

  return null;
}

/**
 * Fetch the context (ancestors and descendants) for a given status
 * Returns only the descendants array for thread building
 *
 * Best-effort: any HTTP or network failure is logged and yields an empty array.
 */
async function fetchStatusContext(
  instanceUrl: string,
  statusId: string,
  logger: Logger,
): Promise<PleromaStatus[]> {
  try {
    const contextUrl = `${instanceUrl}/api/v1/statuses/${statusId}/context`;
    logger.info(`Fetching context for status: ${statusId}`);

    const response = await fetchWithTimeout(contextUrl);
    if (!response.ok) {
      logger.warn(`Failed to fetch context: HTTP ${response.status}`);
      return [];
    }

    const context: { ancestors?: PleromaStatus[]; descendants?: PleromaStatus[] } =
      await response.json();
    const descendants = Array.isArray(context?.descendants) ? context.descendants : [];
    logger.info(`Fetched ${descendants.length} descendants for status ${statusId}`);
    return descendants;
  } catch (error) {
    logger.warn(`Failed to fetch status context: ${error}`);
    return [];
  }
}

/**
 * Build a direct author-to-author reply chain from the thread starter
 * Stops when encountering a reply from another user or a missing link
 *
 * @returns The starter followed by each successive self-reply, in order.
 */
function buildAuthorChain(starter: PleromaStatus, descendants: PleromaStatus[]): PleromaStatus[] {
  const authorAccountId = starter.account.id;

  // Index the author's replies by parent id; the first reply seen for a parent wins.
  const replyByParentId = new Map<string, PleromaStatus>();
  for (const status of descendants) {
    const parentId = status.in_reply_to_id;
    if (parentId && status.account.id === authorAccountId && !replyByParentId.has(parentId)) {
      replyByParentId.set(parentId, status);
    }
  }

  const chain: PleromaStatus[] = [starter];
  const visitedIds = new Set<string>([starter.id]);
  let nextPost = replyByParentId.get(starter.id);

  // The visited set guarantees termination even if the reply links form a cycle.
  while (nextPost && !visitedIds.has(nextPost.id)) {
    chain.push(nextPost);
    visitedIds.add(nextPost.id);
    nextPost = replyByParentId.get(nextPost.id);
  }

  return chain;
}

/**
 * Strip thread markers from content (1/n, 2/n, 3/4, etc.)
 *
 * Only counters that stand alone as a token (optionally bracketed) are removed, so
 * digit/digit runs inside URLs or dates such as ".../2024/05/..." are left intact.
 * The thread emoji 🧵 is removed as well.
 */
function stripThreadMarkers(content: string): string {
  return content.replace(THREAD_MARKER_PATTERN, "$1").replace(/🧵/gu, "").trim();
}

/** Collect lower-cased tag names from `#tag` and `[#tag](url)` occurrences in `text`. */
function collectHashtagNames(text: string): string[] {
  const names: string[] = [];
  for (const match of text.matchAll(HASHTAG_NAME_PATTERN)) {
    const name = match[1] || match[2];
    if (name) {
      names.push(name.toLowerCase());
    }
  }
  return names;
}

/**
 * Extract trailing hashtags from content
 * Handles both hashtag-only lines and hashtags at the end of text lines
 * Returns the main content without trailing tags and the extracted tags
 */
function extractTrailingHashtags(content: string): {
  mainContent: string;
  tags: string[];
} {
  const tags: string[] = [];

  // First, drop empty and hashtag-only lines from the end, collecting their tags.
  const lines = content.split("\n");
  while (lines.length > 0) {
    const lastLine = lines[lines.length - 1]?.trim() || "";
    if (lastLine && !HASHTAG_ONLY_LINE.test(lastLine)) {
      break;
    }
    if (lastLine) {
      tags.push(...collectHashtagNames(lastLine));
    }
    lines.pop();
  }

  let mainContent = lines.join("\n");

  // Second, handle trailing hashtags at the end of the last line (even if there's other text)
  const trailingText = mainContent.match(TRAILING_HASHTAGS)?.[1];
  if (trailingText) {
    tags.push(...collectHashtagNames(trailingText));
    mainContent = mainContent.replace(TRAILING_HASHTAGS, "").trim();
  }

  return {
    mainContent,
    tags: [...new Set(tags)], // Deduplicate within this segment
  };
}

/**
 * Merge thread posts into a single content structure with image grids per segment
 *
 * Segments are separated by a Markdown horizontal rule; trailing hashtags of all
 * segments are consolidated into one plain `#tag` line at the very end.
 */
function mergeThreadContent(chain: PleromaStatus[]): string {
  const segments: string[] = [];
  const allTags = new Set<string>(); // Collect all tags from all segments

  for (const post of chain) {
    const markdown = stripThreadMarkers(cleanContent(post.content || ""));
    const { mainContent, tags } = extractTrailingHashtags(markdown);
    for (const tag of tags) {
      allTags.add(tag);
    }

    // Text first (without trailing hashtags), then the image grid of this segment.
    const imageGrid = buildImageGridHtml(post.media_attachments);
    segments.push(imageGrid ? `${mainContent}\n\n${imageGrid}` : mainContent);
  }

  const mergedSegments = segments.join("\n\n---\n\n");
  if (allTags.size === 0) {
    return mergedSegments;
  }

  const tagLine = [...allTags].map((tag) => `#${tag}`).join(" ");
  return `${mergedSegments}\n\n${tagLine}`;
}

/**
 * Resolve an account id through the account search endpoint.
 *
 * @returns The id of the first search hit, or null when the lookup fails or finds nothing.
 */
async function getAccountId(
  instanceUrl: string,
  username: string,
  logger: Logger,
): Promise<string | null> {
  try {
    const searchUrl = `${instanceUrl}/api/v1/accounts/search?q=${encodeURIComponent(username)}&limit=1`;
    logger.info(`Looking up account ID for username: ${username}`);

    const response = await fetchWithTimeout(searchUrl);
    if (!response.ok) {
      logger.warn(`Failed to search for account: HTTP ${response.status}`);
      return null;
    }

    const accounts: PleromaAccount[] = await response.json();
    const account = Array.isArray(accounts) ? accounts[0] : undefined;
    if (!account) {
      logger.warn(`No account found for username: ${username}`);
      return null;
    }

    logger.info(`Found account ID: ${account.id} for @${account.acct}`);
    return account.id;
  } catch (error) {
    logger.warn(`Failed to lookup account ID: ${error}`);
    return null;
  }
}

/**
 * Fetch an account's statuses page by page, following the Link header.
 *
 * @param maxPosts - Upper bound on the number of statuses; `-1` fetches all pages.
 * @returns The accumulated statuses in the order the API returned them.
 * @throws Error when a page still fails after all retry attempts or returns a non-array body.
 */
async function fetchAccountStatuses(
  instanceUrl: string,
  accountId: string,
  maxPosts: number,
  logger: Logger,
): Promise<PleromaStatus[]> {
  const allStatuses: PleromaStatus[] = [];
  const fetchAll = maxPosts === -1;
  let maxId: string | null = null;
  let pageCount = 0;

  // Fetch pages until we have enough posts or no more pages available
  while (fetchAll || allStatuses.length < maxPosts) {
    pageCount++;

    // If fetching all, always use the page limit; otherwise only ask for what is still missing.
    const requestLimit = fetchAll
      ? STATUSES_PAGE_LIMIT
      : Math.min(STATUSES_PAGE_LIMIT, maxPosts - allStatuses.length);
    const params = new URLSearchParams({
      limit: String(requestLimit),
      exclude_replies: "true",
      exclude_reblogs: "true",
    });
    if (maxId) {
      params.set("max_id", maxId);
    }
    const statusesUrl = `${instanceUrl}/api/v1/accounts/${accountId}/statuses?${params.toString()}`;

    // Retry logic for network issues
    let response: Response | undefined;
    let lastError: unknown;
    for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
      try {
        const modeMsg = fetchAll ? " [fetching all posts]" : ` [target: ${maxPosts}]`;
        logger.info(
          `Attempt ${attempt} to fetch statuses page ${pageCount}${maxId ? ` (max_id: ${maxId})` : ""}${modeMsg}...`,
        );

        const candidate = await fetchWithTimeout(statusesUrl);
        if (!candidate.ok) {
          throw new Error(`HTTP ${candidate.status}: ${candidate.statusText}`);
        }
        response = candidate;
        break; // Success, exit retry loop
      } catch (error) {
        lastError = error;
        logger.warn(`Attempt ${attempt} failed: ${error}`);
        if (attempt < MAX_FETCH_ATTEMPTS) {
          logger.info(`Retrying in ${RETRY_DELAY_MS / 1000} seconds...`);
          await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
        }
      }
    }

    if (!response) {
      throw new Error(
        `Failed to fetch statuses after ${MAX_FETCH_ATTEMPTS} attempts. Last error: ${lastError}`,
      );
    }

    const statuses: PleromaStatus[] = await response.json();
    if (!Array.isArray(statuses)) {
      throw new Error(`Unexpected statuses payload on page ${pageCount}: expected an array`);
    }
    logger.info(`Fetched ${statuses.length} statuses from page ${pageCount}`);

    // If no statuses returned, we've reached the end
    if (statuses.length === 0) {
      logger.info("No more statuses available");
      break;
    }

    allStatuses.push(...statuses);

    // Parse Link header to get next page max_id
    const nextMaxId = parseNextPageMaxId(response.headers.get("link"));
    if (!nextMaxId) {
      logger.info("No more pages available (no next link in header)");
      break;
    }

    // If the max_id hasn't changed, we're stuck in a loop - break
    if (nextMaxId === maxId) {
      logger.warn("Pagination returned same max_id, stopping to prevent infinite loop");
      break;
    }

    maxId = nextMaxId;
  }

  const summaryMsg = fetchAll
    ? `Total fetched: ${allStatuses.length} statuses (all available) across ${pageCount} page(s)`
    : `Total fetched: ${allStatuses.length} statuses (target: ${maxPosts}) across ${pageCount} page(s)`;
  logger.info(summaryMsg);

  return allStatuses;
}

/**
 * Decide whether a status must be skipped.
 * Boosts and replies are already excluded by the API parameters; they are re-checked here,
 * and sensitive (NSFW) posts are dropped as well.
 */
function isFilteredStatus(status: PleromaStatus): boolean {
  return Boolean(status.reblog) || Boolean(status.in_reply_to_id) || Boolean(status.sensitive);
}

/** Return the unique, lower-cased `#tag` tokens (including the leading `#`) found in the text. */
function extractHashtags(htmlContent: string): string[] {
  const matches = htmlContent.match(/#(\w+)/gi);
  return matches ? [...new Set(matches.map((tag) => tag.toLowerCase()))] : [];
}

/**
 * Check whether a status carries at least one allowed tag.
 * An empty or missing allow-list disables filtering. Comparison is case-insensitive and
 * ignores a leading `#` on either side.
 */
function hasAllowedTag(status: PleromaStatus, allowedTags: string[]): boolean {
  if (!allowedTags || allowedTags.length === 0) {
    return true; // No filtering if no tags specified
  }

  const normalize = (tag: string): string => tag.toLowerCase().replace(/^#/, "");
  const allowed = new Set(allowedTags.map(normalize));
  return extractHashtags(status.content || "").some((tag) => allowed.has(normalize(tag)));
}

/**
 * Convert status HTML to Markdown.
 *
 * Every <span> wrapper (opening and closing tag) is removed while its text is kept, then
 * the result is handed to Turndown and runs of blank lines are collapsed.
 *
 * NOTE(review): the class-specific span selectors were lost from the damaged source, so all
 * spans are unwrapped here. The earlier "ellipsis"/"invisible" content-dropping replaces
 * required a closing </span> that a preceding replace had already removed, so they never
 * matched; link text is therefore kept in full, as before.
 */
function cleanContent(htmlContent: string): string {
  const withoutSpans = htmlContent.replace(/<span[^>]*>/gi, "").replace(/<\/span>/gi, "");

  // Convert to markdown
  const markdown = turndownService.turndown(withoutSpans);

  // Clean up extra whitespace
  return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
}

const HTML_ATTRIBUTE_ESCAPES: Record<string, string> = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
  "#": "&#35;",
};

/**
 * Escape a value for use inside a double-quoted HTML attribute.
 * "#" is escaped too so that the hashtag linkification pass, which also runs over merged
 * thread content containing this HTML, cannot rewrite text inside attributes.
 */
function escapeHtmlAttribute(value: string): string {
  return value.replace(/[&<>"'#]/g, (char) => HTML_ATTRIBUTE_ESCAPES[char] ?? char);
}

/**
 * Build HTML grid for image attachments
 * Returns empty string if no image attachments are provided
 *
 * NOTE(review): the markup was lost from the damaged source; the wrapper class name and the
 * exact tag structure below are assumptions - confirm against the site's styles.
 */
function buildImageGridHtml(attachments: PleromaMediaAttachment[]): string {
  const imageAttachments = attachments.filter((attachment) => attachment.type === "image");

  if (imageAttachments.length === 0) {
    return "";
  }

  const images = imageAttachments
    .map((attachment) => {
      const description = escapeHtmlAttribute(attachment.description || "Image");
      const source = escapeHtmlAttribute(attachment.url);
      return `  <img src="${source}" alt="${description}" loading="lazy" />`;
    })
    .join("\n");

  // No blank lines inside the block, so Markdown treats it as one raw HTML block.
  return `<div class="image-grid">\n${images}\n</div>`;
}

/**
 * Replace all hashtags in content with internal tag links
 * Handles both plain #hashtags and existing markdown links [#tag](url)
 * Returns modified content and extracted tags array
 */
function replaceHashtagsWithLinks(content: string): {
  content: string;
  tags: string[];
} {
  const tags: string[] = [];
  const toInternalLink = (tag: string): string => {
    const slug = tag.toLowerCase();
    tags.push(slug);
    return `[#${tag}](/tags/${slug})`;
  };

  // First, replace existing markdown hashtag links: [#tag](any-url)
  const relinked = content.replace(/\[#(\w+)\]\([^)]+\)/g, (_match: string, tag: string) =>
    toInternalLink(tag),
  );

  // Then, replace plain #hashtags. The lookbehind skips hashtags already in [#tag] form,
  // URL fragments ("page#section", "/#/route") and HTML entities ("&#39;").
  const linked = relinked.replace(/(?<![\w\[&\/])#(\w+)/g, (_match: string, tag: string) =>
    toInternalLink(tag),
  );

  return {
    content: linked,
    tags: [...new Set(tags)], // Deduplicate
  };
}

/**
 * Replace Pleroma notice links with internal links when the post exists in our collection
 * Handles both markdown links and plain URLs
 *
 * @param existingPostIds - Status ids that have a local entry; other links are left untouched.
 */
function replacePleromaLinks(
  content: string,
  instanceUrl: string,
  existingPostIds: Set<string>,
): string {
  // Escape special regex characters in instanceUrl
  const escapedInstanceUrl = instanceUrl.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  // Pattern to match notice URLs - captures the statusId
  const noticePattern = `${escapedInstanceUrl}/notice/([A-Za-z0-9]+)`;

  // Replace markdown links: [text](url)
  const markdownLinkRegex = new RegExp(`\\[([^\\]]+)\\]\\(${noticePattern}\\)`, "g");
  const withMarkdownLinks = content.replace(
    markdownLinkRegex,
    (match: string, linkText: string, statusId: string) =>
      existingPostIds.has(statusId) ? `[${linkText}](/posts/pleroma-${statusId}/)` : match,
  );

  // Replace plain URLs; the negative lookbehind skips URLs that are a markdown link target.
  const plainUrlRegex = new RegExp(`(?<!\\]\\()${noticePattern}`, "g");
  return withMarkdownLinks.replace(plainUrlRegex, (match: string, statusId: string) =>
    existingPostIds.has(statusId) ? `/posts/pleroma-${statusId}/` : match,
  );
}

/**
 * Render Markdown to HTML with GitHub-flavoured Markdown and line breaks converted to <br>.
 * Options are passed per call so the shared `marked` instance is not reconfigured globally.
 *
 * @returns The HTML string, or "" if `marked.parse` returned a Promise.
 */
function markdownToHtml(markdown: string): string {
  const html = marked.parse(markdown, {
    breaks: true, // Convert line breaks to <br>
    gfm: true, // GitHub flavored markdown
  });

  // marked.parse can return string or Promise
  return typeof html === "string" ? html : "";
}

/**
 * Derive a title from the first sentence of the first line.
 * Markdown links and images are reduced to their text first so that link syntax and URLs do
 * not leak into (or prematurely cut) the title. Falls back to "Micro post".
 */
function extractTitle(content: string): string {
  const firstLine = content.split("\n")[0];
  if (!firstLine) return "Micro post";

  const withoutLinks = firstLine.replace(/!?\[([^\]]*)\]\([^)]*\)/g, "$1");
  const firstSentence = withoutLinks.split(/[.!?]/)[0];
  if (!firstSentence) return "Micro post";

  // Limit title length and clean it up
  const truncated =
    firstSentence.length > 60 ? `${firstSentence.substring(0, 57)}...` : firstSentence;
  const title = truncated
    .replace(/[#*_`]/g, "") // Remove markdown formatting
    .trim();

  return title || "Micro post";
}

/** Return `tags` with `tag` appended unless it is already present. */
function withTag(tags: string[], tag: string): string[] {
  return tags.includes(tag) ? tags : [...tags, tag];
}

/**
 * Astro content loader that imports a Pleroma account's public posts as entries.
 *
 * The loader never throws: on any failure it logs a warning, clears the store and lets the
 * build continue without Pleroma posts.
 *
 * @param config - Instance, account and filtering options.
 */
export function pleromaLoader(config: PleromaFeedConfig): Loader {
  return {
    name: "pleroma-loader",
    load: async ({ store, logger }) => {
      try {
        const { username, maxPosts = 20, accountId: configAccountId } = config;
        // A trailing slash would produce "//api/..." URLs and break notice-link matching.
        const instanceUrl = config.instanceUrl.replace(/\/+$/, "");

        logger.info(`Fetching Pleroma posts via API for user: ${username}`);

        // Get account ID (use provided one or lookup by username)
        let accountId: string | undefined = configAccountId;
        if (!accountId) {
          const lookedUpAccountId = await getAccountId(instanceUrl, username, logger);
          if (!lookedUpAccountId) {
            logger.warn("Failed to get account ID. Continuing without Pleroma posts...");
            store.clear();
            return;
          }
          accountId = lookedUpAccountId;
        }

        // Fetch statuses from API
        const statuses = await fetchAccountStatuses(instanceUrl, accountId, maxPosts, logger);
        logger.info(`Fetched ${statuses.length} statuses from API`);

        // Filter statuses
        const validStatuses = statuses.filter((status) => {
          if (isFilteredStatus(status)) return false;
          if (config.allowedTags && !hasAllowedTag(status, config.allowedTags)) return false;
          return true;
        });
        logger.info(`After filtering: ${validStatuses.length} valid posts`);

        // Collect all post IDs for link replacement
        const allPostIds = new Set<string>(validStatuses.map((status) => status.id));

        // Clear existing entries
        store.clear();

        let processedCount = 0;

        // Process each status; a failure affects only that status.
        for (const status of validStatuses) {
          try {
            const content = status.content || "";
            let cleanedContent: string;
            let extractedTags: string[];

            // Check if this is a thread starter and thread merging is enabled
            if (config.mergeThreads !== false && isThreadStarter(content)) {
              logger.info(`Detected thread starter: ${status.id}`);

              // Fetch context and build the author chain
              const descendants = await fetchStatusContext(instanceUrl, status.id, logger);
              const chain = buildAuthorChain(status, descendants);
              logger.info(`Built chain with ${chain.length} post(s) for thread ${status.id}`);

              // Merged content already contains the per-segment image grids.
              const linked = replaceHashtagsWithLinks(mergeThreadContent(chain));
              extractedTags = linked.tags;
              cleanedContent = replacePleromaLinks(linked.content, instanceUrl, allPostIds);
            } else {
              // Process as single post
              const linked = replaceHashtagsWithLinks(cleanContent(content));
              extractedTags = linked.tags;
              const contentWithLinks = replacePleromaLinks(linked.content, instanceUrl, allPostIds);

              // Build image grid HTML and append to content for RSS feeds
              const imageGrid = buildImageGridHtml(status.media_attachments);
              cleanedContent = imageGrid ? `${contentWithLinks}\n\n${imageGrid}` : contentWithLinks;
            }

            const title = extractTitle(cleanedContent);

            // Add language code as a tag
            // Default to Polish since most Pleroma posts are in Polish when language is not specified
            const language = status.language || "pl";
            const tags = withTag(withTag(extractedTags, "microblog"), language);

            // Create note entry
            store.set({
              id: `pleroma-${status.id}`,
              data: {
                title,
                description:
                  cleanedContent.substring(0, 160) + (cleanedContent.length > 160 ? "..." : ""),
                publishDate: new Date(status.created_at),
                sourceUrl: status.url,
                language,
                tags,
                draft: false,
                author: "Dawid",
              },
              body: cleanedContent,
              rendered: {
                html: markdownToHtml(cleanedContent),
              },
            });
            processedCount++;

            logger.info(`Processed post: ${title.substring(0, 50)}...`);
          } catch (error) {
            logger.warn(`Failed to process status ${status.id}: ${error}`);
          }
        }

        logger.info(`Successfully loaded ${processedCount} Pleroma posts`);
      } catch (error) {
        logger.warn(`Pleroma loader failed: ${error}`);
        logger.info("Continuing build without Pleroma posts...");
        // Don't throw error to prevent build failure
        store.clear();
      }
    },
  };
}