1 files changed, 375 insertions, 0 deletions
diff --git a/src/loaders/pleroma.ts b/src/loaders/pleroma.ts
new file mode 100644
index 0000000..dc6a05c
--- /dev/null
+++ b/src/loaders/pleroma.ts
@@ -0,0 +1,375 @@
+import type { Loader } from "astro/loaders";
+import { XMLParser } from "fast-xml-parser";
+import TurndownService from "turndown";
+
+/**
+ * Options accepted by `pleromaLoader`.
+ */
+interface PleromaFeedConfig {
+	/** Base URL of the instance; the loader appends `/users/<username>.rss` to it. */
+	instanceUrl: string;
+	/** Account name inserted into the feed URL. */
+	username: string;
+	/** Upper bound on the posts kept after filtering; the loader uses 20 when this is omitted. */
+	maxPosts?: number;
+	/**
+	 * NOTE(review): declared but not read anywhere in this file. The loader
+	 * decides between Atom and RSS by inspecting the response body instead.
+	 */
+	feedType?: "rss" | "atom";
+}
+
+/**
+ * Shape this file expects for one `<item>` of the parsed RSS document.
+ * Attribute keys carry the `@_` prefix configured on the XML parser.
+ */
+interface RssItem {
+	/** Item identifier; the loader takes its last `/`-separated segment as the post id. */
+	guid: string;
+	title: string;
+	/** Post body; the loader hands it to `cleanContent` as HTML. */
+	description: string;
+	/** Publication timestamp; the loader passes it to `new Date(...)`. */
+	pubDate: string;
+	/** Fallback source for the post id when `guid` yields nothing. */
+	link: string;
+	/** Either one value or an array; `isFilteredPostRss` accepts both and scans for "nsfw"/"sensitive". */
+	category?: string | string[];
+	"activity:object-type"?: string;
+	/** Compared against the ActivityStreams "share" verb URI to detect boosts. */
+	"activity:verb"?: string;
+	/** Its mere presence marks the item as a reply. */
+	"thr:in-reply-to"?: {
+		"@_ref": string;
+	};
+}
+
+/**
+ * Top-level shape `parseRssFeed` expects from the XML parser: `rss > channel > item`.
+ */
+interface RssFeed {
+	rss: {
+		channel: {
+			title: string;
+			description: string;
+			link: string;
+			/** Absent, a single item, or an array; `parseRssFeed` normalizes all three cases to an array. */
+			item?: RssItem | RssItem[];
+		};
+	};
+}
+
+/**
+ * Shape this file expects for one `<entry>` of the parsed Atom document.
+ * Attribute keys carry the `@_` prefix configured on the XML parser; `#text`
+ * is the element's text content.
+ */
+interface AtomEntry {
+	/** Entry identifier; the loader takes its last `/`-separated segment as the post id. */
+	id: string;
+	title: string;
+	/** Post body; the loader reads `#text` and hands it to `cleanContent` as HTML. */
+	content: {
+		"#text": string;
+		"@_type": string;
+	};
+	/** Publication timestamp; the loader passes it to `new Date(...)`. */
+	published: string;
+	updated: string;
+	link: {
+		"@_href": string;
+		"@_rel": string;
+		"@_type": string;
+	}[];
+	author: {
+		name: string;
+		uri: string;
+	};
+	/** Typed as an array, though `isFilteredPostAtom` also tolerates a single object. */
+	category?: {
+		"@_term": string;
+	}[];
+	"activity:object-type"?: string;
+	/** Compared against the ActivityStreams "share" verb URI to detect boosts. */
+	"activity:verb"?: string;
+	/** Its mere presence marks the entry as a reply. */
+	"thr:in-reply-to"?: {
+		"@_ref": string;
+	};
+}
+
+/**
+ * Top-level shape `parseAtomFeed` expects from the XML parser: `feed > entry`.
+ */
+interface AtomFeed {
+	feed: {
+		title: string;
+		id: string;
+		updated: string;
+		/** Absent, a single entry, or an array; `parseAtomFeed` normalizes all three cases to an array. */
+		entry?: AtomEntry | AtomEntry[];
+	};
+}
+
+/**
+ * Parses an Atom document and returns its entries as an array.
+ *
+ * Value coercion is switched off so every tag and attribute value stays a
+ * string, which is what the `AtomEntry` interface declares. With coercion on,
+ * a numeric-looking value (a post whose text is just "12345", a category term
+ * such as "2024") arrives as a number, and later string calls like
+ * `.toLowerCase()` or `.replace()` throw.
+ *
+ * @param xmlContent - Raw XML text of the feed.
+ * @returns The feed's entries, or an empty array when the document has no `feed.entry`.
+ */
+function parseAtomFeed(xmlContent: string): AtomEntry[] {
+	const parser = new XMLParser({
+		ignoreAttributes: false,
+		attributeNamePrefix: "@_",
+		parseAttributeValue: false,
+		parseTagValue: false,
+	});
+
+	const result: AtomFeed = parser.parse(xmlContent);
+	const entry = result.feed?.entry;
+
+	if (!entry) {
+		return [];
+	}
+
+	// Handle both a single entry (bare object) and several entries (array)
+	return Array.isArray(entry) ? entry : [entry];
+}
+
+/**
+ * Parses an RSS document and returns its items as an array.
+ *
+ * Value coercion is switched off so every tag and attribute value stays a
+ * string, which is what the `RssItem` interface declares. With coercion on,
+ * a numeric-looking `<description>` or `<category>` arrives as a number, and
+ * later string calls like `.toLowerCase()` or `.replace()` throw.
+ *
+ * Parsing problems are logged to the console and reported as an empty result
+ * rather than thrown.
+ *
+ * @param xmlContent - Raw XML text of the feed.
+ * @returns The channel's items, or an empty array when there are none or parsing failed.
+ */
+function parseRssFeed(xmlContent: string): RssItem[] {
+	const parser = new XMLParser({
+		ignoreAttributes: false,
+		attributeNamePrefix: "@_",
+		parseAttributeValue: false,
+		parseTagValue: false,
+	});
+
+	try {
+		const result: RssFeed = parser.parse(xmlContent);
+		const item = result.rss?.channel?.item;
+
+		if (!item) {
+			// Dump what was parsed so an unexpected document layout can be diagnosed.
+			console.log("RSS structure:", JSON.stringify(result, null, 2));
+			return [];
+		}
+
+		// Handle both a single item (bare object) and several items (array)
+		return Array.isArray(item) ? item : [item];
+	} catch (error) {
+		console.error("Failed to parse RSS feed:", error);
+		console.log("XML content length:", xmlContent.length);
+		console.log("XML preview:", xmlContent.substring(0, 1000));
+		return [];
+	}
+}
+
+/**
+ * Decides whether an Atom entry should be left out of the loaded posts.
+ *
+ * An entry is filtered when it is a boost (ActivityStreams "share" verb), a
+ * reply (has `thr:in-reply-to`), or carries a category whose term contains
+ * "nsfw" or "sensitive" (case-insensitive).
+ *
+ * Category terms are converted with `String(...)` before comparison: the XML
+ * parser may hand back a number for a numeric-looking term, and calling
+ * `.toLowerCase()` on it would throw and abort the whole filter pass.
+ *
+ * @param entry - Parsed Atom entry.
+ * @returns `true` when the entry must be skipped.
+ */
+function isFilteredPostAtom(entry: AtomEntry): boolean {
+	// Filter out boosts/reblogs
+	if (entry["activity:verb"] === "http://activitystrea.ms/schema/1.0/share") {
+		return true;
+	}
+
+	// Filter out replies
+	if (entry["thr:in-reply-to"]) {
+		return true;
+	}
+
+	if (!entry.category) {
+		return false;
+	}
+
+	// Filter out NSFW/sensitive content; a lone category arrives as a bare object
+	const categories = Array.isArray(entry.category) ? entry.category : [entry.category];
+	return categories.some((cat) => {
+		const term = String(cat?.["@_term"] ?? "").toLowerCase();
+		return term.includes("nsfw") || term.includes("sensitive");
+	});
+}
+
+/**
+ * Decides whether an RSS item should be left out of the loaded posts.
+ *
+ * An item is filtered when it is a boost (ActivityStreams "share" verb), a
+ * reply (has `thr:in-reply-to`), or carries a category containing "nsfw" or
+ * "sensitive" (case-insensitive).
+ *
+ * Categories are normalized to text before comparison. The XML parser may
+ * return a number for a numeric-looking category, or an object with a `#text`
+ * key when the element has attributes; calling `.toLowerCase()` on either
+ * would throw and abort the whole filter pass.
+ *
+ * @param item - Parsed RSS item.
+ * @returns `true` when the item must be skipped.
+ */
+function isFilteredPostRss(item: RssItem): boolean {
+	// Filter out boosts/reblogs
+	if (item["activity:verb"] === "http://activitystrea.ms/schema/1.0/share") {
+		return true;
+	}
+
+	// Filter out replies
+	if (item["thr:in-reply-to"]) {
+		return true;
+	}
+
+	if (!item.category) {
+		return false;
+	}
+
+	// Filter out NSFW/sensitive content; a lone category arrives as a bare value
+	const categories: unknown[] = Array.isArray(item.category) ? item.category : [item.category];
+	return categories.some((cat) => {
+		let raw: unknown = cat;
+		if (typeof cat === "object" && cat !== null) {
+			// Element with attributes: the text lives under "#text".
+			raw = "#text" in cat ? cat["#text"] : undefined;
+		}
+		const label = String(raw ?? "").toLowerCase();
+		return label.includes("nsfw") || label.includes("sensitive");
+	});
+}
+
+/**
+ * Converts a post's HTML into tidy Markdown.
+ *
+ * Before conversion, Pleroma/Mastodon presentation spans are simplified:
+ * - `invisible` spans are dropped together with their text;
+ * - `ellipsis` spans are unwrapped and a "…" is appended to their text
+ *   (NOTE(review): this follows Mastodon's link-shortening markup, where the
+ *   ellipsis span holds the visible part of a long URL — confirm this is the
+ *   wanted rendering);
+ * - `mention` and `hashtag` spans are unwrapped, keeping their text.
+ *
+ * The two patterns that need a closing `</span>` run first. Stripping every
+ * `</span>` beforehand would leave them with nothing to match.
+ *
+ * @param htmlContent - HTML body of a post.
+ * @returns Trimmed Markdown with runs of blank lines collapsed to a single one.
+ */
+function cleanContent(htmlContent: string): string {
+	const turndownService = new TurndownService({
+		headingStyle: "atx",
+		codeBlockStyle: "fenced",
+	});
+
+	// Remove or replace common Pleroma/Mastodon elements
+	const cleanedContent = htmlContent
+		// Whole-span rewrites first, while the closing tags are still in place
+		.replace(/<span class="[^"]*invisible[^"]*"[^>]*>.*?<\/span>/gi, "") // Remove invisible text
+		.replace(/<span class="[^"]*ellipsis[^"]*"[^>]*>(.*?)<\/span>/gi, "$1\u2026") // Keep shortened text, mark the cut
+		// Then unwrap the remaining decorative spans, keeping their content
+		.replace(/<span class="[^"]*mention[^"]*"[^>]*>/gi, "")
+		.replace(/<span class="[^"]*hashtag[^"]*"[^>]*>/gi, "")
+		.replace(/<\/span>/gi, "");
+
+	// Convert to markdown
+	const markdown = turndownService.turndown(cleanedContent);
+
+	// Clean up extra whitespace
+	return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
+}
+
+/**
+ * Derives a short title from a post's Markdown.
+ *
+ * The title is the first sentence of the first line (cut at the first `.`,
+ * `!` or `?`), limited to 60 characters, with the Markdown markers
+ * `#`, `*`, `_` and backticks removed. Longer sentences are cut to 57
+ * characters followed by "...".
+ *
+ * Length is measured and cut by Unicode code point rather than by UTF-16
+ * unit, so an emoji at the cut position is never split into a lone surrogate.
+ *
+ * @param content - Markdown text of the post.
+ * @returns The derived title, or "Micro post" when nothing usable is found.
+ */
+function extractTitle(content: string): string {
+	const fallbackTitle = "Micro post";
+
+	// Extract first line or first sentence as title
+	const firstLine = content.split("\n")[0];
+	if (!firstLine) return fallbackTitle;
+
+	const firstSentence = firstLine.split(/[.!?]/)[0];
+	if (!firstSentence) return fallbackTitle;
+
+	// Array.from iterates by code point, keeping surrogate pairs intact.
+	const codePoints = Array.from(firstSentence);
+	const shortened =
+		codePoints.length > 60 ? `${codePoints.slice(0, 57).join("")}...` : firstSentence;
+
+	// Remove markdown formatting
+	const title = shortened.replace(/[#*_`]/g, "").trim();
+
+	return title || fallbackTitle;
+}
+
+/** Minimal logger surface used by the loader and its helpers. */
+interface FeedLogger {
+	info(message: string): void;
+	warn(message: string): void;
+}
+
+/** Feed-format-independent view of one post, ready to be cleaned and stored. */
+interface NormalizedPost {
+	/** Identifier used to build the store key (`pleroma-<id>`). */
+	id: string;
+	/** Human-readable reference quoted in warnings when the post cannot be processed. */
+	label: string;
+	/** Raw HTML body of the post. */
+	html: string;
+	/** Publication timestamp exactly as found in the feed. */
+	published: string;
+}
+
+/** Returns the text of a parsed XML value, whether it is a plain value or an object holding `#text`. */
+function xmlText(value: unknown): string {
+	if (value == null) return "";
+	if (typeof value === "object") {
+		return "#text" in value ? String(value["#text"] ?? "") : "";
+	}
+	return String(value);
+}
+
+/** Returns the part of `value` after its last `/` (empty when `value` ends with `/`). */
+function lastPathSegment(value: string): string {
+	return value.split("/").pop() ?? "";
+}
+
+/** Escapes the characters that are significant in HTML text and attribute values. */
+function escapeHtml(text: string): string {
+	return text
+		.replace(/&/g, "&amp;")
+		.replace(/</g, "&lt;")
+		.replace(/>/g, "&gt;")
+		.replace(/"/g, "&quot;");
+}
+
+/** Maps an Atom entry onto the common post shape. */
+function atomEntryToPost(entry: AtomEntry): NormalizedPost {
+	const entryId = xmlText(entry.id);
+	return {
+		id: lastPathSegment(entryId) || entryId,
+		label: `entry ${entryId}`,
+		html: xmlText(entry.content),
+		published: xmlText(entry.published),
+	};
+}
+
+/** Maps an RSS item onto the common post shape, taking the id from the GUID, then the link. */
+function rssItemToPost(item: RssItem): NormalizedPost {
+	const guid = xmlText(item.guid);
+	return {
+		id:
+			lastPathSegment(guid) ||
+			lastPathSegment(xmlText(item.link)) ||
+			// Last resort when the item offers nothing to derive an id from.
+			Math.random().toString(36),
+		label: `RSS item ${guid}`,
+		html: xmlText(item.description),
+		published: xmlText(item.pubDate),
+	};
+}
+
+/**
+ * Fetches the feed, trying up to three times with a 10 second timeout per
+ * attempt and a 2 second pause between attempts.
+ *
+ * @returns The successful response, or `undefined` once every attempt has failed.
+ */
+async function fetchFeedWithRetry(
+	feedUrl: string,
+	logger: FeedLogger,
+): Promise<Response | undefined> {
+	const maxAttempts = 3;
+	let lastError: unknown;
+
+	for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+		logger.info(`Attempt ${attempt} to fetch feed...`);
+
+		// Create timeout controller
+		const controller = new AbortController();
+		const timeoutId = setTimeout(() => controller.abort(), 10000);
+
+		try {
+			const response = await fetch(feedUrl, {
+				headers: {
+					"User-Agent": "Astro Blog (pleroma-loader)",
+				},
+				redirect: "follow", // Follow redirects
+				signal: controller.signal,
+			});
+
+			if (response.ok) {
+				return response;
+			}
+			lastError = new Error(`HTTP ${response.status}: ${response.statusText}`);
+		} catch (error) {
+			lastError = error;
+		} finally {
+			// Always release the timer, including when fetch itself rejects.
+			clearTimeout(timeoutId);
+		}
+
+		logger.warn(`Attempt ${attempt} failed: ${lastError}`);
+
+		if (attempt < maxAttempts) {
+			logger.info("Retrying in 2 seconds...");
+			await new Promise((resolve) => setTimeout(resolve, 2000));
+		}
+	}
+
+	logger.warn(
+		`Failed to fetch Pleroma feed after ${maxAttempts} attempts. Last error: ${lastError}`,
+	);
+	return undefined;
+}
+
+/**
+ * Creates an Astro content loader that imports a Pleroma account's public
+ * posts as notes.
+ *
+ * On each load the feed at `<instanceUrl>/users/<username>.rss` is fetched,
+ * detected as Atom or RSS from its content, stripped of boosts, replies and
+ * NSFW/sensitive posts, and limited to `maxPosts` (default 20). The store is
+ * cleared and refilled with one entry per remaining post, keyed
+ * `pleroma-<post id>`.
+ *
+ * The loader never throws: when the feed cannot be fetched or processed, the
+ * store is cleared, a warning is logged and the build continues without
+ * Pleroma posts.
+ *
+ * @param config - Instance, account and limit settings.
+ * @returns The loader to pass to a content collection definition.
+ */
+export function pleromaLoader(config: PleromaFeedConfig): Loader {
+	return {
+		name: "pleroma-loader",
+		load: async ({ store, logger }) => {
+			try {
+				const { instanceUrl, username, maxPosts = 20 } = config;
+				// Use RSS URL that redirects to Atom - this bypasses some access restrictions
+				// (per the original author). Trailing slashes on the base URL are dropped
+				// so the path never contains "//".
+				const feedUrl = `${instanceUrl.replace(/\/+$/, "")}/users/${username}.rss`;
+
+				logger.info(`Fetching Pleroma feed from: ${feedUrl}`);
+
+				const response = await fetchFeedWithRetry(feedUrl, logger);
+				if (!response) {
+					logger.info("Continuing without Pleroma posts...");
+					store.clear();
+					return;
+				}
+
+				const xmlContent = await response.text();
+				logger.info(`Received XML content length: ${xmlContent.length}`);
+
+				// Auto-detect if it's Atom or RSS based on content
+				const isAtomFeed =
+					xmlContent.includes("<feed") ||
+					xmlContent.includes('xmlns="http://www.w3.org/2005/Atom"');
+				logger.info(`Detected feed type: ${isAtomFeed ? "Atom" : "RSS"}`);
+
+				let posts: NormalizedPost[];
+				if (isAtomFeed) {
+					const entries = parseAtomFeed(xmlContent);
+					logger.info(`Parsed ${entries.length} entries from Atom feed`);
+					posts = entries
+						.filter((entry) => !isFilteredPostAtom(entry))
+						.slice(0, maxPosts)
+						.map(atomEntryToPost);
+				} else {
+					const items = parseRssFeed(xmlContent);
+					logger.info(`Parsed ${items.length} items from RSS feed`);
+					posts = items
+						.filter((item) => !isFilteredPostRss(item))
+						.slice(0, maxPosts)
+						.map(rssItemToPost);
+				}
+
+				logger.info(`After filtering: ${posts.length} valid posts`);
+
+				// Clear existing entries
+				store.clear();
+
+				let storedCount = 0;
+				for (const post of posts) {
+					try {
+						const cleanedContent = cleanContent(post.html);
+						const title = extractTitle(cleanedContent);
+
+						// Create note entry
+						store.set({
+							id: `pleroma-${post.id}`,
+							data: {
+								title,
+								description:
+									cleanedContent.substring(0, 160) + (cleanedContent.length > 160 ? "..." : ""),
+								publishDate: new Date(post.published),
+							},
+							body: cleanedContent,
+							rendered: {
+								// The text comes from a remote feed: escape it before wrapping it in markup.
+								html: `<p>${escapeHtml(cleanedContent).replace(/\n\n/g, "</p><p>")}</p>`,
+							},
+						});
+
+						storedCount++;
+						logger.info(`Processed post: ${title.substring(0, 50)}...`);
+					} catch (error) {
+						// One bad post must not prevent the others from loading.
+						logger.warn(`Failed to process ${post.label}: ${error}`);
+					}
+				}
+
+				logger.info(`Successfully loaded ${storedCount} Pleroma posts`);
+			} catch (error) {
+				logger.warn(`Pleroma loader failed: ${error}`);
+				logger.info("Continuing build without Pleroma posts...");
+				// Don't throw error to prevent build failure
+				store.clear();
+			}
+		},
+	};
+}