summary refs log tree commit diff
path: root/src/loaders
diff options
context:
space:
mode:
Diffstat (limited to 'src/loaders')
-rw-r--r-- src/loaders/pleroma.ts 375
1 files changed, 375 insertions, 0 deletions
diff --git a/src/loaders/pleroma.ts b/src/loaders/pleroma.ts
new file mode 100644
index 0000000..dc6a05c
--- /dev/null
+++ b/src/loaders/pleroma.ts
@@ -0,0 +1,375 @@
+import type { Loader } from "astro/loaders";
+import { XMLParser } from "fast-xml-parser";
+import TurndownService from "turndown";
+
+/** Options accepted by `pleromaLoader`. */
+interface PleromaFeedConfig {
+ /** Base URL of the instance; the feed is requested from `${instanceUrl}/users/${username}.rss`. */
+ instanceUrl: string;
+ /** Account name that is interpolated into the feed URL. */
+ username: string;
+ /** Upper bound on the number of posts kept after filtering; the loader uses 20 when this is omitted. */
+ maxPosts?: number;
+ /** NOTE(review): not read by the loader in this file, which detects the format from the response body instead. */
+ feedType?: "rss" | "atom";
+}
+
+/**
+ * One `<item>` of the RSS feed as produced by the XML parser configured in
+ * this file (attributes are exposed with an "@_" prefix).
+ */
+interface RssItem {
+ /** Item identifier; its last "/"-separated segment is used as the post id. */
+ guid: string;
+ title: string;
+ /** Post body as HTML; this is what gets converted to Markdown. */
+ description: string;
+ /** Publication timestamp, passed to `new Date(...)` unchanged. */
+ pubDate: string;
+ /** Fallback source for the post id when `guid` yields nothing. */
+ link: string;
+ /** A lone `<category>` arrives as a single value, several as an array. */
+ category?: string | string[];
+ "activity:object-type"?: string;
+ /** Compared against the ActivityStreams "share" verb to detect boosts. */
+ "activity:verb"?: string;
+ /** Present on replies; any truthy value causes the item to be filtered out. */
+ "thr:in-reply-to"?: {
+ "@_ref": string;
+ };
+}
+
+/** Top-level shape of a parsed RSS document: `rss > channel > item`. */
+interface RssFeed {
+ rss: {
+ channel: {
+ title: string;
+ description: string;
+ link: string;
+ /** Absent for an empty feed, a single object for one item, an array for several. */
+ item?: RssItem | RssItem[];
+ };
+ };
+}
+
+/**
+ * One `<entry>` of the Atom feed as produced by the XML parser configured in
+ * this file: attributes carry an "@_" prefix and the text of an element that
+ * also has attributes sits under "#text".
+ */
+interface AtomEntry {
+  /** Entry identifier; its last "/"-separated segment is used as the post id. */
+  id: string;
+  title: string;
+  /** Post body; "#text" holds the HTML that gets converted to Markdown. */
+  content: {
+    "#text": string;
+    "@_type": string;
+  };
+  /** Publication timestamp, passed to `new Date(...)` unchanged. */
+  published: string;
+  updated: string;
+  // NOTE(review): an entry with exactly one <link> presumably arrives as a
+  // single object rather than an array - confirm before indexing into this.
+  link: {
+    "@_href": string;
+    "@_rel": string;
+    "@_type": string;
+  }[];
+  author: {
+    name: string;
+    uri: string;
+  };
+  /**
+   * Tags attached to the post. A lone `<category>` is delivered as a single
+   * object and several as an array, which is why consumers normalise with
+   * `Array.isArray` before iterating.
+   */
+  category?: { "@_term": string } | { "@_term": string }[];
+  "activity:object-type"?: string;
+  /** Compared against the ActivityStreams "share" verb to detect boosts. */
+  "activity:verb"?: string;
+  /** Present on replies; any truthy value causes the entry to be filtered out. */
+  "thr:in-reply-to"?: {
+    "@_ref": string;
+  };
+}
+
+/** Top-level shape of a parsed Atom document: `feed > entry`. */
+interface AtomFeed {
+ feed: {
+ title: string;
+ id: string;
+ updated: string;
+ /** Absent for an empty feed, a single object for one entry, an array for several. */
+ entry?: AtomEntry | AtomEntry[];
+ };
+}
+
+/**
+ * Parses an Atom document and returns its entries as an array.
+ *
+ * @param xmlContent Raw XML text of the feed.
+ * @returns Every `<entry>` found; an empty array when the document has no
+ *   `feed` element or the feed has no entries.
+ *
+ * Parser errors are not caught here and propagate to the caller.
+ */
+function parseAtomFeed(xmlContent: string): AtomEntry[] {
+  const atomParser = new XMLParser({
+    attributeNamePrefix: "@_",
+    ignoreAttributes: false,
+    parseAttributeValue: true,
+  });
+  const parsed: AtomFeed = atomParser.parse(xmlContent);
+
+  const entry = parsed.feed?.entry;
+  if (!entry) {
+    return [];
+  }
+
+  // A feed with exactly one entry yields a bare object instead of an array.
+  return Array.isArray(entry) ? entry : [entry];
+}
+
+/**
+ * Parses an RSS document and returns its items as an array.
+ *
+ * @param xmlContent Raw XML text of the feed.
+ * @returns Every `<item>` under `rss > channel`; an empty array when there
+ *   are none or when parsing fails.
+ *
+ * This function never throws for a parse failure: the error and a preview of
+ * the input are written to the console and an empty array is returned.
+ */
+function parseRssFeed(xmlContent: string): RssItem[] {
+  const rssParser = new XMLParser({
+    attributeNamePrefix: "@_",
+    ignoreAttributes: false,
+    parseAttributeValue: true,
+  });
+
+  try {
+    const parsed: RssFeed = rssParser.parse(xmlContent);
+    const item = parsed.rss?.channel?.item;
+
+    if (item) {
+      // A channel with exactly one item yields a bare object instead of an array.
+      return Array.isArray(item) ? item : [item];
+    }
+
+    // Nothing to return: dump what was parsed so the structure can be inspected.
+    console.log("RSS structure:", JSON.stringify(parsed, null, 2));
+    return [];
+  } catch (error) {
+    console.error("Failed to parse RSS feed:", error);
+    console.log("XML content length:", xmlContent.length);
+    console.log("XML preview:", xmlContent.substring(0, 1000));
+    return [];
+  }
+}
+
+/**
+ * Decides whether an Atom entry should be left out of the collection.
+ *
+ * An entry is filtered when it is a boost (its verb is the ActivityStreams
+ * "share" verb), a reply (it carries `thr:in-reply-to`), or has a category
+ * whose term contains "nsfw" or "sensitive" (case-insensitive).
+ *
+ * @param entry Parsed Atom entry.
+ * @returns `true` when the entry must be skipped.
+ */
+function isFilteredPostAtom(entry: AtomEntry): boolean {
+  if (entry["activity:verb"] === "http://activitystrea.ms/schema/1.0/share") {
+    return true;
+  }
+
+  if (entry["thr:in-reply-to"]) {
+    return true;
+  }
+
+  if (!entry.category) {
+    return false;
+  }
+
+  // A lone <category> is delivered as a single object rather than an array.
+  const categories = Array.isArray(entry.category) ? entry.category : [entry.category];
+
+  return categories.some((cat) => {
+    // The feed is parsed with parseAttributeValue enabled, so a numeric-looking
+    // term (a "#2024" hashtag, say) is a number at runtime. Coerce to a string
+    // first so one such tag cannot throw and abort the whole load.
+    const term = String(cat?.["@_term"] ?? "").toLowerCase();
+    return term.includes("nsfw") || term.includes("sensitive");
+  });
+}
+
+/**
+ * Decides whether an RSS item should be left out of the collection.
+ *
+ * An item is filtered when it is a boost (its verb is the ActivityStreams
+ * "share" verb), a reply (it carries `thr:in-reply-to`), or has a category
+ * containing "nsfw" or "sensitive" (case-insensitive).
+ *
+ * @param item Parsed RSS item.
+ * @returns `true` when the item must be skipped.
+ */
+function isFilteredPostRss(item: RssItem): boolean {
+  if (item["activity:verb"] === "http://activitystrea.ms/schema/1.0/share") {
+    return true;
+  }
+
+  if (item["thr:in-reply-to"]) {
+    return true;
+  }
+
+  if (!item.category) {
+    return false;
+  }
+
+  // A lone <category> is delivered as a single value rather than an array.
+  const categories: unknown[] = Array.isArray(item.category) ? item.category : [item.category];
+
+  return categories.some((cat) => {
+    // The parser does not guarantee strings here: numeric-looking text may be
+    // delivered as a number, and an element that also has attributes is an
+    // object with its text under "#text". Normalise to a string so an unusual
+    // tag cannot throw and abort the whole load.
+    const text = typeof cat === "object" && cat !== null ? (cat as Record<string, unknown>)["#text"] : cat;
+    const label = String(text ?? "").toLowerCase();
+    return label.includes("nsfw") || label.includes("sensitive");
+  });
+}
+
+/**
+ * Converts the HTML body of a post into trimmed Markdown.
+ *
+ * Selected `<span>` markup is stripped first, the remainder is handed to
+ * Turndown, and runs of blank lines in the result are collapsed to one.
+ *
+ * @param htmlContent HTML as delivered by the feed.
+ * @returns Markdown text without leading or trailing whitespace.
+ */
+function cleanContent(htmlContent: string): string {
+  const converter = new TurndownService({
+    headingStyle: "atx",
+    codeBlockStyle: "fenced",
+  });
+
+  // Applied strictly in this order; every match is replaced by the empty string.
+  const stripPatterns: RegExp[] = [
+    /<span class="[^"]*mention[^"]*"[^>]*>/gi, // opening tag of mention spans (content stays)
+    /<\/span>/gi, // every closing span tag
+    /<span class="[^"]*hashtag[^"]*"[^>]*>/gi, // opening tag of hashtag spans (content stays)
+    // NOTE(review): the two patterns below need a closing </span>, but all of
+    // those were removed by the second pattern, so they never match and the
+    // ellipsis/invisible text stays in the output. Decide whether that text
+    // should really be dropped before reordering.
+    /<span class="[^"]*ellipsis[^"]*"[^>]*>.*?<\/span>/gi,
+    /<span class="[^"]*invisible[^"]*"[^>]*>.*?<\/span>/gi,
+  ];
+
+  const strippedHtml = stripPatterns.reduce((html, pattern) => html.replace(pattern, ""), htmlContent);
+  const markdown = converter.turndown(strippedHtml);
+
+  // Three or more consecutive line breaks (with optional whitespace) become one blank line.
+  return markdown.trim().replace(/\n\s*\n\s*\n/g, "\n\n");
+}
+
+/**
+ * Derives a short title from the Markdown body of a post.
+ *
+ * The title is the first sentence of the first line (text up to the first
+ * ".", "!" or "?"), shortened to at most 60 UTF-16 units including a trailing
+ * "..." and stripped of the Markdown characters "#", "*", "_" and "`".
+ *
+ * @param content Markdown text of the post.
+ * @returns The derived title, or "Micro post" when nothing usable remains.
+ */
+function extractTitle(content: string): string {
+  const fallbackTitle = "Micro post";
+  const maxLength = 60;
+  const ellipsis = "...";
+
+  const firstLine = content.split("\n")[0];
+  if (!firstLine) return fallbackTitle;
+
+  const firstSentence = firstLine.split(/[.!?]/)[0];
+  if (!firstSentence) return fallbackTitle;
+
+  let candidate = firstSentence;
+  if (firstSentence.length > maxLength) {
+    let cut = maxLength - ellipsis.length;
+    // Never end on a high surrogate: cutting an emoji (or any other astral
+    // character) in half would leave an unpaired surrogate in the title.
+    const lastKept = firstSentence.charCodeAt(cut - 1);
+    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
+      cut -= 1;
+    }
+    candidate = `${firstSentence.substring(0, cut)}${ellipsis}`;
+  }
+
+  // Markdown markers are removed after truncation, so the result may be
+  // shorter than the limit.
+  const title = candidate.replace(/[#*_`]/g, "").trim();
+
+  return title || fallbackTitle;
+}
+
+/** Number of times the feed request is attempted before giving up. */
+const PLEROMA_FETCH_ATTEMPTS = 3;
+/** Milliseconds after which a single feed request is aborted. */
+const PLEROMA_FETCH_TIMEOUT_MS = 10000;
+/** Milliseconds to wait between two attempts. */
+const PLEROMA_RETRY_DELAY_MS = 2000;
+
+/** The subset of the loader's logger that the helpers below rely on. */
+interface PleromaLogger {
+  info(message: string): void;
+  warn(message: string): void;
+}
+
+/** Replaces the characters that are significant in HTML with their entities. */
+function escapePleromaHtml(text: string): string {
+  const entities: Record<string, string> = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&#39;",
+  };
+  return text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
+}
+
+/**
+ * Requests the feed, retrying on network errors and non-2xx responses.
+ *
+ * @returns The last response received (if any) and the last error seen.
+ */
+async function fetchPleromaFeed(
+  feedUrl: string,
+  logger: PleromaLogger,
+): Promise<{ response: Response | undefined; lastError: unknown }> {
+  let response: Response | undefined;
+  let lastError: unknown;
+
+  for (let attempt = 1; attempt <= PLEROMA_FETCH_ATTEMPTS; attempt++) {
+    try {
+      logger.info(`Attempt ${attempt} to fetch feed...`);
+
+      const controller = new AbortController();
+      const timeoutId = setTimeout(() => controller.abort(), PLEROMA_FETCH_TIMEOUT_MS);
+      try {
+        response = await fetch(feedUrl, {
+          headers: {
+            "User-Agent": "Astro Blog (pleroma-loader)",
+          },
+          redirect: "follow",
+          signal: controller.signal,
+        });
+      } finally {
+        // Clear the timer on failure too, so a rejected request does not leave
+        // a pending timeout behind.
+        clearTimeout(timeoutId);
+      }
+
+      if (response.ok) {
+        break;
+      }
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    } catch (error) {
+      lastError = error;
+      logger.warn(`Attempt ${attempt} failed: ${error}`);
+
+      if (attempt < PLEROMA_FETCH_ATTEMPTS) {
+        logger.info(`Retrying in ${PLEROMA_RETRY_DELAY_MS / 1000} seconds...`);
+        await new Promise((resolve) => setTimeout(resolve, PLEROMA_RETRY_DELAY_MS));
+      }
+    }
+  }
+
+  return { response, lastError };
+}
+
+/** Builds the store entry for one post from its id, HTML body and timestamp. */
+function buildPleromaNote(postId: string, html: string, published: string) {
+  const body = cleanContent(html);
+  const title = extractTitle(body);
+
+  // `body` is text derived from remote feed content, so it is escaped before
+  // being embedded in markup; blank lines become paragraph breaks.
+  const paragraphs = escapePleromaHtml(body).replace(/\n\n/g, "</p><p>");
+
+  return {
+    title,
+    entry: {
+      id: `pleroma-${postId}`,
+      data: {
+        title,
+        description: body.substring(0, 160) + (body.length > 160 ? "..." : ""),
+        publishDate: new Date(published),
+      },
+      body,
+      rendered: {
+        html: `<p>${paragraphs}</p>`,
+      },
+    },
+  };
+}
+
+/**
+ * Creates an Astro content loader that imports a user's public posts from a
+ * Pleroma instance.
+ *
+ * The loader requests `${instanceUrl}/users/${username}.rss`, detects from the
+ * response body whether it received Atom or RSS, drops boosts, replies and
+ * NSFW/sensitive posts, keeps at most `maxPosts` (default 20) of the rest and
+ * stores each one under the id `pleroma-<post id>`.
+ *
+ * The loader never throws: if the feed cannot be fetched or processed, the
+ * store is cleared and the build continues without Pleroma posts. A failure in
+ * a single post only skips that post. `config.feedType` is not consulted.
+ *
+ * @param config Instance URL, username and optional post limit.
+ * @returns The loader object to pass to a content collection definition.
+ */
+export function pleromaLoader(config: PleromaFeedConfig): Loader {
+  return {
+    name: "pleroma-loader",
+    load: async ({ store, logger }) => {
+      try {
+        const { instanceUrl, username, maxPosts = 20 } = config;
+        // Use RSS URL that redirects to Atom - this bypasses some access restrictions.
+        // Trailing slashes on the instance URL are dropped to avoid a "//" in the path.
+        const feedUrl = `${instanceUrl.replace(/\/+$/, "")}/users/${username}.rss`;
+
+        logger.info(`Fetching Pleroma feed from: ${feedUrl}`);
+
+        const { response, lastError } = await fetchPleromaFeed(feedUrl, logger);
+
+        if (!response || !response.ok) {
+          logger.warn(
+            `Failed to fetch Pleroma feed after ${PLEROMA_FETCH_ATTEMPTS} attempts. Last error: ${lastError}`,
+          );
+          logger.info("Continuing without Pleroma posts...");
+          store.clear();
+          return;
+        }
+
+        const xmlContent = await response.text();
+        logger.info(`Received XML content length: ${xmlContent.length}`);
+
+        // Auto-detect if it's Atom or RSS based on content
+        const isAtomFeed =
+          xmlContent.includes("<feed") ||
+          xmlContent.includes('xmlns="http://www.w3.org/2005/Atom"');
+        logger.info(`Detected feed type: ${isAtomFeed ? "Atom" : "RSS"}`);
+
+        let loadedCount = 0;
+
+        // Stores one post; a failure is logged and skips only that post.
+        const storeNote = (label: string, build: () => ReturnType<typeof buildPleromaNote>): void => {
+          try {
+            const note = build();
+            store.set(note.entry);
+            loadedCount++;
+            logger.info(`Processed post: ${note.title.substring(0, 50)}...`);
+          } catch (error) {
+            logger.warn(`Failed to process ${label}: ${error}`);
+          }
+        };
+
+        if (isAtomFeed) {
+          const entries = parseAtomFeed(xmlContent);
+          logger.info(`Parsed ${entries.length} entries from Atom feed`);
+
+          const validEntries = entries
+            .filter((entry) => !isFilteredPostAtom(entry))
+            .slice(0, maxPosts);
+          logger.info(`After filtering: ${validEntries.length} valid posts`);
+
+          // Replace whatever an earlier run left in the store.
+          store.clear();
+
+          for (const entry of validEntries) {
+            storeNote(`entry ${entry.id}`, () => {
+              // Extract post ID from the entry ID
+              const postId = entry.id.split("/").pop() || entry.id;
+              return buildPleromaNote(postId, entry.content?.["#text"] || "", entry.published);
+            });
+          }
+        } else {
+          const items = parseRssFeed(xmlContent);
+          logger.info(`Parsed ${items.length} items from RSS feed`);
+
+          const validItems = items.filter((item) => !isFilteredPostRss(item)).slice(0, maxPosts);
+          logger.info(`After filtering: ${validItems.length} valid posts`);
+
+          // Replace whatever an earlier run left in the store.
+          store.clear();
+
+          for (const item of validItems) {
+            storeNote(`RSS item ${item.guid}`, () => {
+              // Extract post ID from the GUID or link; fall back to a random
+              // token when neither yields one.
+              const postId =
+                item.guid?.split("/").pop() ||
+                (typeof item.link === "string" ? item.link.split("/").pop() : null) ||
+                Math.random().toString(36);
+              return buildPleromaNote(postId, item.description || "", item.pubDate);
+            });
+          }
+        }
+
+        // Count what was actually stored, for either feed format.
+        logger.info(`Successfully loaded ${loadedCount} Pleroma posts`);
+      } catch (error) {
+        logger.warn(`Pleroma loader failed: ${error}`);
+        logger.info("Continuing build without Pleroma posts...");
+        // Don't throw error to prevent build failure
+        store.clear();
+      }
+    },
+  };
+}