-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathgenerate-llms.js
347 lines (297 loc) · 12.8 KB
/
generate-llms.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
const fs = require('fs');
const path = require('path');
// Directory holding the documentation sources, resolved two levels above this script.
const DOCS_DIR = path.join(__dirname, "../../docs");
// Directory that receives the generated files, resolved two levels above this script.
const STATIC_DIR = path.join(__dirname, "../../static");
// Destination of the curated link index.
const OUTPUT_FILE = path.join(STATIC_DIR, "llms.txt");
// Destination of the concatenated archive of processed pages.
const OUTPUT_FULL_FILE = path.join(STATIC_DIR, "llms-full.txt");
// Site origin used as the prefix of the page links written to llms.txt (no trailing slash).
const BASE_URL = "https://docs.replicated.com";
// Fixed heading and summary written at the top of llms.txt, ahead of the generated link list.
// Everything between the backticks is emitted verbatim, including the final newline.
const STATIC_HEADER = `# Replicated Documentation
> Replicated is a commercial software distribution platform. Independent software vendors (ISVs) can use features of the Replicated Platform to distribute modern commercial software into complex, customer-controlled environments, including on-prem and air gap.
`;
// Curated list of pages that get an entry in llms.txt.
// - Each entry is a file path relative to the docs directory, including its .md/.mdx extension.
// - Entries are emitted in the order listed here.
// - An entry that cannot be read is reported with a warning and left out of llms.txt.
const INCLUDED_FILES = [
// Add specific file paths, relative to the docs directory
// Compatibility Matrix docs
'vendor/testing-about.md',
'vendor/testing-how-to.md',
'vendor/testing-supported-clusters.md',
// Embedded Cluster docs
'enterprise/embedded-manage-nodes.mdx',
'enterprise/installing-embedded-air-gap.mdx',
'enterprise/installing-embedded-automation.mdx',
'enterprise/installing-embedded-requirements.mdx',
'enterprise/installing-embedded.mdx',
'reference/embedded-cluster-install.mdx',
'vendor/embedded-overview.mdx',
// Helm Install docs
'vendor/helm-install-airgap.mdx',
'vendor/helm-install-overview.mdx',
'vendor/helm-install-release.md',
'vendor/install-with-helm.mdx',
'vendor/helm-install-values-schema.mdx',
// Intro and onboarding
'intro-replicated.mdx',
'vendor/kots-faq.mdx',
'vendor/quick-start.mdx',
'vendor/replicated-onboarding.mdx',
// KOTS CLI docs
'reference/kots-cli-getting-started.md',
// KOTS docs
'enterprise/installing-existing-cluster-airgapped.mdx',
'enterprise/installing-general-requirements.mdx',
'enterprise/snapshots-creating.md',
'enterprise/snapshots-restoring-full.mdx',
'enterprise/snapshots-velero-cli-installing.md',
'enterprise/updating-app-manager.mdx',
'reference/custom-resource-about.md',
'reference/custom-resource-application.mdx',
'reference/custom-resource-config.mdx',
'reference/custom-resource-helmchart-v2.mdx',
'reference/template-functions-about.mdx',
'reference/template-functions-examples.mdx',
'reference/template-functions-config-context.md',
'reference/template-functions-license-context.md',
'reference/template-functions-static-context.md',
'vendor/helm-native-about.mdx',
'vendor/helm-native-v2-using.md',
'vendor/helm-packaging-airgap-bundles.mdx',
'vendor/resources-annotations-templating.md',
'vendor/snapshots-overview.mdx',
// kURL docs
'vendor/kurl-about.mdx',
'enterprise/installing-kurl-requirements.mdx',
'enterprise/installing-kurl.mdx',
'enterprise/installing-kurl-airgap.mdx',
'vendor/packaging-embedded-kubernetes.mdx',
// Preflight checks and support bundles
'vendor/preflight-defining.mdx',
'vendor/preflight-examples.mdx',
'vendor/preflight-host-preflights.md',
'vendor/preflight-running.md',
'vendor/preflight-support-bundle-about.mdx',
'vendor/support-bundle-customizing.mdx',
'vendor/support-bundle-examples.mdx',
'vendor/support-bundle-generating.mdx',
// Proxy registry docs
'vendor/private-images-about.md',
'vendor/helm-image-registry.mdx',
'vendor/private-images-kots.mdx',
'vendor/packaging-public-images.mdx',
// Replicated CLI docs
'reference/replicated-cli-installing.mdx',
// Replicated SDK docs
'reference/replicated-sdk-apis.md',
'vendor/replicated-sdk-installing.mdx',
'vendor/replicated-sdk-overview.mdx',
'vendor/replicated-sdk-customizing.md',
// Vendor Portal docs
'vendor/custom-domains-using.md',
'vendor/custom-domains.md',
'vendor/custom-metrics.md',
'vendor/insights-app-status.md',
'vendor/instance-insights-event-data.mdx',
'vendor/licenses-about.mdx',
'vendor/licenses-adding-custom-fields.md',
'vendor/licenses-install-types.mdx',
'vendor/licenses-reference-sdk.mdx',
'vendor/releases-about.mdx',
'vendor/releases-creating-channels.md',
'vendor/releases-creating-cli.mdx',
'vendor/releases-share-download-portal.md',
'vendor/replicated-api-tokens.md',
'vendor/team-management-rbac-configuring.md',
'vendor/team-management-rbac-resource-names.md',
'vendor/team-management.md',
'vendor/telemetry-air-gap.mdx',
'vendor/vendor-portal-manage-app.md',
];
// Partial bodies keyed by partial name (the file name without its leading "_" and extension).
// The object is created without a prototype so that looking up a name such as
// "constructor" or "toString" can never resolve to an inherited Object member.
const partialsCache = Object.create(null);
/**
 * Recursively collect every partial under `dir` into `partialsCache`.
 *
 * A partial is a regular file whose name starts with "_" and ends with
 * ".md" or ".mdx". Sub-directories rejected by `shouldSkipDirectory` are
 * not descended into.
 *
 * @param {string} dir - Directory to scan.
 * @returns {void}
 */
function loadPartials(dir) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });

  for (const entry of entries) {
    const entryPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      if (!shouldSkipDirectory(entryPath)) {
        loadPartials(entryPath);
      }
      continue;
    }

    const hasMarkdownExtension = ['.md', '.mdx'].some(ext => entry.name.endsWith(ext));
    if (!entry.isFile() || !entry.name.startsWith('_') || !hasMarkdownExtension) {
      continue;
    }

    // Drop a leading front matter block, then surrounding whitespace.
    const rawText = fs.readFileSync(entryPath, 'utf8');
    const body = rawText.replace(/^---[\s\S]*?---/, '').trim();

    // Cache key: file name minus its extension and the leading underscore.
    const cacheKey = path.parse(entry.name).name.slice(1);
    partialsCache[cacheKey] = body;
  }
}
/**
 * Inline partials into a page and strip its import statements.
 *
 * Imports whose path goes through a `/partials/` directory are recorded and
 * removed; each self-closing `<ImportName />` tag is then replaced by the
 * matching body from `partialsCache`. Every other line that starts with
 * `import` is removed as well (this also affects such lines inside code samples).
 *
 * @param {string} content - Raw page text.
 * @param {string} filePath - Path of the page, used only in warning messages.
 * @returns {string} The processed text with surrounding whitespace trimmed.
 */
function processContent(content, filePath) {
  const partialImports = [];

  // Record and remove partial imports. An optional trailing semicolon is
  // consumed too, so it is not left behind as a stray ";" line.
  let result = content.replace(
    /^import\s+(\w+)\s+from\s+["']([^"']*\/partials\/[^"']+)["'][ \t]*;?/gm,
    (match, importName, importPath) => {
      partialImports.push({ name: importName, path: importPath });
      return '';
    }
  );

  // Remove every remaining line that begins with `import`.
  result = result.replace(/^import.*$/gm, '');

  for (const { name, path: importPath } of partialImports) {
    // Cache key: file name minus its extension and the leading underscore.
    const partialName = path.basename(importPath, path.extname(importPath)).substring(1);
    const partialBody = partialsCache[partialName];

    // Only a string is a real cache hit; this rejects inherited members such as
    // "constructor" and accepts an intentionally empty partial.
    if (typeof partialBody === 'string') {
      const tagPattern = new RegExp(`<${name}\\s*/>`, 'g');
      // A replacer function inserts the body verbatim. A replacement string
      // would interpret "$$", "$&", "$1", ... inside the partial.
      result = result.replace(tagPattern, () => partialBody);
    } else {
      console.warn(`Warning: Partial '${partialName}' not found for file ${filePath}`);
    }
  }

  return result.trim();
}
/**
 * Tell whether a path lies in (or is) one of the excluded directories.
 *
 * Matching is done on whole path segments, so "templates" excludes
 * `docs/templates/page.md` but not `docs/vendor/email-templates.md`.
 * Both "/" and "\" are accepted as separators.
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} [excludedDirs] - Directory names (or multi-segment paths such as "a/b") to exclude.
 * @returns {boolean} True when any excluded directory occurs as a segment run of `filePath`.
 */
function shouldSkipDirectory(filePath, excludedDirs = ['.history', 'templates', 'pdfs']) {
  // Wrap the path in separators so first and last segments match like inner ones.
  const normalizedPath = `/${String(filePath).replace(/\\/g, '/')}/`;

  return excludedDirs.some(dir => {
    const dirName = String(dir).replace(/\\/g, '/').replace(/^\/+|\/+$/g, '');
    // An empty name cannot denote a directory; ignore it rather than match everything.
    return dirName.length > 0 && normalizedPath.includes(`/${dirName}/`);
  });
}
/**
 * Recursively collect and process every non-partial Markdown page under `dir`.
 *
 * Entries rejected by `shouldSkipDirectory` are ignored, as are files whose
 * name starts with "_". Each page is run through `processContent`.
 *
 * @param {string} dir - Directory to scan.
 * @param {Array<{path: string, title: string, content: string}>} [fileList] -
 *   Accumulator that results are appended to (it is mutated and returned).
 * @param {boolean} [excludeReleaseNotes=true] - When true, skip every entry
 *   whose full path contains the text "release-notes".
 * @returns {Array<{path: string, title: string, content: string}>} `fileList`.
 *   `path` is relative to DOCS_DIR, uses "/" separators and has no extension;
 *   `title` is the first level-1 heading, or the file name without extension.
 */
function getAllMarkdownFiles(dir, fileList = [], excludeReleaseNotes = true) {
  for (const entryName of fs.readdirSync(dir)) {
    const entryPath = path.join(dir, entryName);

    // Substring test on the full path, applied to files and directories alike.
    if (excludeReleaseNotes && entryPath.includes('release-notes')) {
      continue;
    }
    if (shouldSkipDirectory(entryPath)) {
      continue;
    }

    if (fs.statSync(entryPath).isDirectory()) {
      getAllMarkdownFiles(entryPath, fileList, excludeReleaseNotes);
      continue;
    }

    const extension = path.extname(entryName);
    const isMarkdown = extension === '.md' || extension === '.mdx';
    if (!isMarkdown || entryName.startsWith('_')) {
      continue;
    }

    const processedContent = processContent(fs.readFileSync(entryPath, 'utf8'), entryPath);
    const titleMatch = processedContent.match(/^#\s+(.+)$/m);

    // path.relative works with the platform separator, whereas stripping a
    // hard-coded `${DOCS_DIR}/` prefix never matches on Windows. Separators
    // are then normalised so the result is the same on every platform.
    const relativePath = path
      .relative(DOCS_DIR, entryPath)
      .split(path.sep)
      .join('/')
      .replace(/\.(md|mdx)$/, '');

    fileList.push({
      path: relativePath,
      title: titleMatch ? titleMatch[1] : entryName.replace(/\.(md|mdx)$/, ''),
      content: processedContent,
    });
  }

  return fileList;
}
/**
 * Collect every Markdown page under `dir`, release notes included.
 * Thin wrapper around `getAllMarkdownFiles` with release-note exclusion switched off.
 *
 * @param {string} dir - Directory to scan.
 * @param {Array} [fileList] - Accumulator passed through to `getAllMarkdownFiles`.
 * @returns {Array} Whatever `getAllMarkdownFiles` returns.
 */
function getAllMarkdownFilesForStatic(dir, fileList = []) {
  const excludeReleaseNotes = false;
  return getAllMarkdownFiles(dir, fileList, excludeReleaseNotes);
}
/**
 * Read and process the pages listed in `INCLUDED_FILES`.
 *
 * A page that cannot be read or processed is reported with a warning and
 * omitted; the remaining pages keep the order of `INCLUDED_FILES`.
 *
 * @param {string} dir - Directory the `INCLUDED_FILES` entries are relative to.
 * @returns {Array<{path: string, title: string, description: string, content: string}>}
 *   One record per processed page. `path` is the list entry without its
 *   extension; `title` is the first level-1 heading, or the file name
 *   without extension; `description` comes from `extractFirstSentence`.
 */
function getCuratedFiles(dir) {
  const markdownExtension = /\.(md|mdx)$/;
  const curated = [];

  for (const relativePath of INCLUDED_FILES) {
    const absolutePath = path.join(dir, relativePath);
    try {
      const rawText = fs.readFileSync(absolutePath, 'utf8');
      const processedContent = processContent(rawText, absolutePath);
      const heading = processedContent.match(/^#\s+(.+)$/m);
      const fallbackTitle = path.basename(relativePath).replace(markdownExtension, '');

      curated.push({
        path: relativePath.replace(markdownExtension, ''),
        title: heading ? heading[1] : fallbackTitle,
        description: extractFirstSentence(processedContent),
        content: processedContent,
      });
    } catch (error) {
      // Best effort: one unreadable page must not abort the whole run.
      console.warn(`Warning: Could not process file ${relativePath}: ${error.message}`);
    }
  }

  return curated;
}
/**
 * Derive a one-sentence description from a page.
 *
 * Front matter at the very start, lines beginning with `import`, and Markdown
 * headings are removed; the first remaining non-blank line is then cut at its
 * first sentence-ending ".", "!" or "?".
 *
 * @param {string} text - Page text.
 * @returns {string} The first sentence, the whole first line when it has no
 *   sentence ending, or "No description available." when nothing is left.
 */
function extractFirstSentence(text) {
  const body = text
    .replace(/^---[\s\S]*?---/, '')
    .replace(/^import.*$/gm, '')
    .replace(/^#+\s.*$/gm, '');

  const firstParagraph = body
    .split('\n')
    .map(line => line.trim())
    .find(line => line.length > 0);
  if (!firstParagraph) return 'No description available.';

  // A punctuation mark ends the sentence unless it sits inside a URL, inside
  // the text part of a Markdown link, or is glued to a following character.
  const endsSentence = (position) => {
    const insideUrl =
      firstParagraph.lastIndexOf('http', position) > firstParagraph.lastIndexOf(' ', position);
    const insideLinkText =
      firstParagraph.lastIndexOf('[', position) > firstParagraph.lastIndexOf(']', position);
    const hasNextChar = position < firstParagraph.length - 1;
    const gluedToNextChar = hasNextChar && !/[\s\n]/.test(firstParagraph[position + 1]);
    return !insideUrl && !insideLinkText && !gluedToNextChar;
  };

  for (let position = 0; position < firstParagraph.length; position += 1) {
    if ('.!?'.includes(firstParagraph[position]) && endsSentence(position)) {
      return firstParagraph.slice(0, position + 1).trim();
    }
  }

  // No sentence ending found: fall back to the whole line.
  return firstParagraph.trim();
}
/**
 * Write the concatenated archive of all page contents to OUTPUT_FULL_FILE.
 * Each page is followed by a "---" rule, and consecutive pages are
 * additionally separated by a newline.
 *
 * @param {Array<{content: string}>} files - Pages to include, in output order.
 * @returns {void}
 */
function generateFullLLMsTxt(files) {
  const sections = [];
  for (const { content } of files) {
    sections.push(`${content}\n\n---\n\n`);
  }

  fs.writeFileSync(OUTPUT_FULL_FILE, sections.join('\n'));
  console.log("✅ llms-full.txt generated!");
}
/**
 * Write each processed page to `<STATIC_DIR>/<file.path>.md`, creating
 * missing parent directories. Pages without content are reported with a
 * warning and skipped.
 *
 * @param {Array<{path: string, content: string}>} files - Pages to write.
 * @returns {void}
 */
function copyProcessedMarkdownToStatic(files) {
  for (const file of files) {
    if (!file.content) {
      console.warn(`Warning: No content found for file ${file.path}`);
      continue;
    }

    const destination = path.join(STATIC_DIR, `${file.path}.md`);
    const parentDir = path.dirname(destination);
    if (!fs.existsSync(parentDir)) {
      fs.mkdirSync(parentDir, { recursive: true });
    }

    fs.writeFileSync(destination, file.content);
  }
}
/**
 * Write the curated link index to OUTPUT_FILE.
 *
 * The file consists of STATIC_HEADER, a "## Docs" section pointing at the
 * full archive, and one bullet per page of the form
 * `- [title](BASE_URL/path.md): description`.
 *
 * @param {Array<{title: string, path: string, description: string}>} files -
 *   Pages to list, in output order.
 * @returns {void}
 */
function generateLLMsTxt(files) {
  const lines = [
    "## Docs\n",
    // Built from BASE_URL like the page links below, so the site origin is defined in one place.
    `For a complete archive of all documentation pages, see [llms-full.txt](${BASE_URL}/llms-full.txt)\n`,
    ...files.map(file =>
      `- [${file.title}](${BASE_URL}/${file.path}.md): ${file.description}`
    ),
  ];

  fs.writeFileSync(OUTPUT_FILE, STATIC_HEADER + lines.join('\n'));
  console.log("✅ llms.txt generated!");
}
// Main execution. The partials cache must be filled first, because the
// collection calls below process page content against it.
loadPartials(DOCS_DIR);
// Pages for llms-full.txt (release notes excluded)
const allFiles = getAllMarkdownFiles(DOCS_DIR);
// Pages for the static copy (release notes included); this walks the docs tree a second time
const allFilesForStatic = getAllMarkdownFilesForStatic(DOCS_DIR);
// Pages listed in INCLUDED_FILES, for the llms.txt index
const curatedFiles = getCuratedFiles(DOCS_DIR);
// Write llms-full.txt (release notes excluded)
generateFullLLMsTxt(allFiles);
// Write every processed page, release notes included, under the static directory
copyProcessedMarkdownToStatic(allFilesForStatic);
// Write the curated llms.txt index
generateLLMsTxt(curatedFiles);