diff options
Diffstat (limited to 'src/loaders')
| -rw-r--r-- | src/loaders/pleroma.ts | 375 |
1 files changed, 375 insertions, 0 deletions
// Contents of src/loaders/pleroma.ts (the new file introduced by the diff above).
import type { Loader } from "astro/loaders";
import { XMLParser } from "fast-xml-parser";
import TurndownService from "turndown";

/** Options accepted by {@link pleromaLoader}. */
interface PleromaFeedConfig {
  /** Base URL of the instance, e.g. `https://example.social` (trailing slashes are ignored). */
  instanceUrl: string;
  /** Account name whose feed is requested at `<instanceUrl>/users/<username>.rss`. */
  username: string;
  /** Maximum number of posts stored after filtering. Defaults to 20. */
  maxPosts?: number;
  /**
   * Currently not consulted: the loader always requests the `.rss` URL and
   * detects the actual format from the response body.
   */
  feedType?: "rss" | "atom";
}

/**
 * Text-bearing XML element as produced by the parser: a plain string when the
 * element has no attributes, otherwise an object holding the text under
 * `#text` next to the `@_`-prefixed attributes.
 */
type XmlText = string | { "#text"?: string; [attribute: string]: unknown };

interface RssItem {
  guid?: XmlText;
  title?: XmlText;
  description?: XmlText;
  pubDate?: XmlText;
  link?: XmlText;
  category?: XmlText | XmlText[];
  "activity:object-type"?: XmlText;
  "activity:verb"?: XmlText;
  "thr:in-reply-to"?: unknown;
}

interface RssFeed {
  rss: {
    channel: {
      title: string;
      description: string;
      link: string;
      item?: RssItem | RssItem[];
    };
  };
}

interface AtomLink {
  "@_href"?: string;
  "@_rel"?: string;
  "@_type"?: string;
}

interface AtomCategory {
  "@_term"?: string;
}

interface AtomEntry {
  id?: XmlText;
  title?: XmlText;
  content?: XmlText;
  published?: XmlText;
  updated?: XmlText;
  link?: AtomLink | AtomLink[];
  author?: {
    name?: XmlText;
    uri?: XmlText;
  };
  category?: AtomCategory | AtomCategory[];
  "activity:object-type"?: XmlText;
  "activity:verb"?: XmlText;
  "thr:in-reply-to"?: unknown;
}

interface AtomFeed {
  feed: {
    title: string;
    id: string;
    updated: string;
    entry?: AtomEntry | AtomEntry[];
  };
}

/** Format-independent view of one feed post, before cleaning and storing. */
interface FeedPost {
  /** Raw identifier from the feed; used only in log messages. */
  sourceId: string;
  /** Identifier derived from the feed id/guid/link; may be empty. */
  postId: string;
  /** Post body as delivered by the feed (HTML). */
  html: string;
  /** Publication timestamp exactly as written in the feed. */
  published: string;
}

/** Logger type handed to a loader's `load` callback. */
type LoaderLogger = Parameters<Loader["load"]>[0]["logger"];

const SHARE_VERB = "http://activitystrea.ms/schema/1.0/share";
const MAX_FETCH_ATTEMPTS = 3;
const FETCH_TIMEOUT_MS = 10000;
const RETRY_DELAY_MS = 2000;
const DESCRIPTION_MAX_LENGTH = 160;

// One converter is enough; it is configured once and reused for every post.
const turndownService = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

/**
 * Builds the XML parser shared by both feed formats.
 *
 * Value parsing is switched off on purpose: with it enabled, a hashtag such as
 * `2024` or a purely numeric post body is returned as a number and every later
 * string operation (`toLowerCase`, `split`, `replace`) throws.
 */
function createFeedParser(): XMLParser {
  return new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    parseAttributeValue: false,
    parseTagValue: false,
  });
}

/** Wraps a "single value or list" parser result into a list. */
function asArray<T>(value: T | T[] | undefined): T[] {
  if (value === undefined) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/**
 * Returns the text of a parsed XML element, whether it was parsed as a bare
 * string or as an object with a `#text` member. Anything else yields "".
 */
function textOf(value: unknown): string {
  if (typeof value === "string") {
    return value;
  }
  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }
  if (typeof value === "object" && value !== null && "#text" in value) {
    return textOf((value as Record<string, unknown>)["#text"]);
  }
  return "";
}

/** Returns the part of `value` after the last "/", or "" when there is none. */
function lastPathSegment(value: string): string {
  return value.split("/").pop() ?? "";
}

/** True when any tag mentions "nsfw" or "sensitive" (case-insensitive). */
function hasSensitiveTag(tags: string[]): boolean {
  return tags.some((tag) => {
    const lowered = tag.toLowerCase();
    return lowered.includes("nsfw") || lowered.includes("sensitive");
  });
}

/** Escapes the characters that are significant in HTML text content. */
function escapeHtml(text: string): string {
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}

/**
 * Parses an Atom document and returns its entries.
 *
 * @param xmlContent Raw XML text of the feed.
 * @returns All `<entry>` elements, or an empty list when the feed has none.
 */
function parseAtomFeed(xmlContent: string): AtomEntry[] {
  const result: Partial<AtomFeed> = createFeedParser().parse(xmlContent);
  return asArray(result.feed?.entry);
}

/**
 * Parses an RSS document and returns its items.
 *
 * @param xmlContent Raw XML text of the feed.
 * @returns All `<item>` elements, or an empty list when the channel has none.
 */
function parseRssFeed(xmlContent: string): RssItem[] {
  const result: Partial<RssFeed> = createFeedParser().parse(xmlContent);
  return asArray(result.rss?.channel?.item);
}

/**
 * Decides whether an Atom entry must be left out of the collection.
 *
 * @returns `true` for boosts (share verb), replies (`thr:in-reply-to` present)
 *   and entries carrying an NSFW/sensitive category term.
 */
function isFilteredPostAtom(entry: AtomEntry): boolean {
  if (textOf(entry["activity:verb"]) === SHARE_VERB) {
    return true;
  }
  if (entry["thr:in-reply-to"] !== undefined) {
    return true;
  }
  const terms = asArray(entry.category).map((category) => category["@_term"] ?? "");
  return hasSensitiveTag(terms);
}

/**
 * Decides whether an RSS item must be left out of the collection.
 *
 * @returns `true` for boosts (share verb), replies (`thr:in-reply-to` present)
 *   and items carrying an NSFW/sensitive category.
 */
function isFilteredPostRss(item: RssItem): boolean {
  if (textOf(item["activity:verb"]) === SHARE_VERB) {
    return true;
  }
  if (item["thr:in-reply-to"] !== undefined) {
    return true;
  }
  return hasSensitiveTag(asArray(item.category).map(textOf));
}

/**
 * Converts a post's HTML into Markdown.
 *
 * @param htmlContent HTML body of the post.
 * @returns Trimmed Markdown with runs of blank lines collapsed to one.
 */
function cleanContent(htmlContent: string): string {
  // NOTE(review): the blanket `</span>` removal runs before the ellipsis and
  // invisible patterns, so those two can no longer find a closing tag and never
  // match. The order is kept as-is because dropping those spans would change
  // link text; confirm the intended behaviour before reordering.
  const cleanedContent = htmlContent
    .replace(/<span class="[^"]*mention[^"]*"[^>]*>/gi, "") // Drop mention span tags, keep their content
    .replace(/<\/span>/gi, "")
    .replace(/<span class="[^"]*hashtag[^"]*"[^>]*>/gi, "") // Drop hashtag span tags, keep their content
    .replace(/<span class="[^"]*ellipsis[^"]*"[^>]*>.*?<\/span>/gi, "")
    .replace(/<span class="[^"]*invisible[^"]*"[^>]*>.*?<\/span>/gi, "");

  const markdown = turndownService.turndown(cleanedContent);

  return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
}

/**
 * Derives a short title from Markdown content.
 *
 * @param content Markdown body of the post.
 * @returns The first sentence of the first line, cut to 60 characters and
 *   stripped of Markdown markers, or "Micro post" when nothing usable remains.
 */
function extractTitle(content: string): string {
  const firstLine = content.split("\n")[0];
  if (!firstLine) return "Micro post";

  const firstSentence = firstLine.split(/[.!?]/)[0];
  if (!firstSentence) return "Micro post";

  const shortened =
    firstSentence.length > 60 ? `${firstSentence.substring(0, 57)}...` : firstSentence;
  const title = shortened.replace(/[#*_`]/g, "").trim();

  return title || "Micro post";
}

/** Maps an Atom entry onto the format-independent post shape. */
function atomEntryToPost(entry: AtomEntry): FeedPost {
  const id = textOf(entry.id);
  return {
    sourceId: id,
    postId: lastPathSegment(id) || id,
    html: textOf(entry.content),
    published: textOf(entry.published) || textOf(entry.updated),
  };
}

/** Maps an RSS item onto the format-independent post shape. */
function rssItemToPost(item: RssItem): FeedPost {
  const guid = textOf(item.guid);
  const link = textOf(item.link);
  return {
    sourceId: guid || link,
    postId: lastPathSegment(guid) || lastPathSegment(link),
    html: textOf(item.description),
    published: textOf(item.pubDate),
  };
}

/**
 * Downloads the feed, retrying on network errors and non-2xx responses.
 *
 * @param feedUrl Absolute URL of the feed.
 * @param logger Loader logger used for progress and failure messages.
 * @returns The response body, or `undefined` when every attempt failed.
 */
async function fetchFeedXml(feedUrl: string, logger: LoaderLogger): Promise<string | undefined> {
  let lastError: unknown;

  for (let attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);

    try {
      logger.info(`Attempt ${attempt} to fetch feed...`);
      const response = await fetch(feedUrl, {
        headers: {
          "User-Agent": "Astro Blog (pleroma-loader)",
        },
        redirect: "follow",
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      // Read the body inside the attempt so a stalled download is also
      // covered by the timeout and retried.
      return await response.text();
    } catch (error) {
      lastError = error;
      logger.warn(`Attempt ${attempt} failed: ${String(error)}`);
    } finally {
      // Always release the timer, including when fetch itself rejected.
      clearTimeout(timeoutId);
    }

    if (attempt < MAX_FETCH_ATTEMPTS) {
      logger.info("Retrying in 2 seconds...");
      await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY_MS));
    }
  }

  logger.warn(
    `Failed to fetch Pleroma feed after ${MAX_FETCH_ATTEMPTS} attempts. Last error: ${String(lastError)}`,
  );
  return undefined;
}

/**
 * Creates an Astro content loader that fills a collection with the public
 * posts of one Pleroma account.
 *
 * The feed is fetched from `<instanceUrl>/users/<username>.rss`; the response
 * is treated as Atom or RSS depending on its content. Boosts, replies and
 * NSFW/sensitive posts are skipped. Each remaining post is stored under the id
 * `pleroma-<postId>` with `title`, `description` and `publishDate` data, the
 * Markdown body, and a minimal HTML rendering.
 *
 * Failures never reject: they are logged and the store is cleared so a build
 * can continue without Pleroma posts.
 *
 * @param config Instance, account and limit settings.
 * @returns The loader object to use in a collection definition.
 */
export function pleromaLoader(config: PleromaFeedConfig): Loader {
  return {
    name: "pleroma-loader",
    load: async ({ store, logger }) => {
      try {
        const { instanceUrl, username, maxPosts = 20 } = config;
        // The .rss URL redirects to Atom on some instances; the format is
        // detected from the body below. Trailing slashes would yield "//users".
        const feedUrl = `${instanceUrl.replace(/\/+$/, "")}/users/${username}.rss`;

        logger.info(`Fetching Pleroma feed from: ${feedUrl}`);

        const xmlContent = await fetchFeedXml(feedUrl, logger);
        if (xmlContent === undefined) {
          logger.info("Continuing without Pleroma posts...");
          store.clear();
          return;
        }
        logger.info(`Received XML content length: ${xmlContent.length}`);

        const isAtomFeed =
          xmlContent.includes("<feed") ||
          xmlContent.includes('xmlns="http://www.w3.org/2005/Atom"');
        logger.info(`Detected feed type: ${isAtomFeed ? "Atom" : "RSS"}`);

        let posts: FeedPost[];
        if (isAtomFeed) {
          const entries = parseAtomFeed(xmlContent);
          logger.info(`Parsed ${entries.length} entries from Atom feed`);
          posts = entries
            .filter((entry) => !isFilteredPostAtom(entry))
            .slice(0, maxPosts)
            .map(atomEntryToPost);
        } else {
          const items = parseRssFeed(xmlContent);
          logger.info(`Parsed ${items.length} items from RSS feed`);
          posts = items
            .filter((item) => !isFilteredPostRss(item))
            .slice(0, maxPosts)
            .map(rssItemToPost);
        }
        logger.info(`After filtering: ${posts.length} valid posts`);

        store.clear();

        let loadedCount = 0;
        for (const post of posts) {
          try {
            const publishDate = new Date(post.published);
            if (Number.isNaN(publishDate.getTime())) {
              throw new Error(`invalid publish date "${post.published}"`);
            }

            const body = cleanContent(post.html);
            const title = extractTitle(body);
            // Fall back to the timestamp so the id stays the same across builds.
            const postId = post.postId || String(publishDate.getTime());
            const description =
              body.substring(0, DESCRIPTION_MAX_LENGTH) +
              (body.length > DESCRIPTION_MAX_LENGTH ? "..." : "");

            store.set({
              id: `pleroma-${postId}`,
              data: {
                title,
                description,
                publishDate,
              },
              body,
              rendered: {
                // The body is plain text/Markdown from a remote feed; escape it
                // before wrapping paragraphs so it cannot inject markup.
                html: `<p>${escapeHtml(body).replace(/\n\n/g, "</p><p>")}</p>`,
              },
            });
            loadedCount++;

            logger.info(`Processed post: ${title.substring(0, 50)}...`);
          } catch (error) {
            logger.warn(`Failed to process post ${post.sourceId}: ${String(error)}`);
          }
        }

        logger.info(`Successfully loaded ${loadedCount} Pleroma posts`);
      } catch (error) {
        logger.warn(`Pleroma loader failed: ${String(error)}`);
        logger.info("Continuing build without Pleroma posts...");
        // Deliberately swallow the error so the site build does not fail.
        store.clear();
      }
    },
  };
}
