Convert to Markdown - NPM Package Usage Guide
Overview
The `convert-to-markdown` package (published on npm as `@knowcode/convert-to-markdown`) provides a simple, programmatic way to convert Excel (.xlsx, .xls, .xlsm), PDF, and Word (.docx, .dotx, .dotm) documents into clean, AI-ready formats such as Markdown and JSON. This guide covers how to use the package in your Node.js applications.
Installation
npm install @knowcode/convert-to-markdown
Quick Start
// CommonJS: load the package's main export
const ConvertToMarkdown = require('@knowcode/convert-to-markdown');
// or using ES modules — use this line INSTEAD of the require above;
// keeping both would declare ConvertToMarkdown twice
import ConvertToMarkdown from '@knowcode/convert-to-markdown';
// Convert an Excel file (.xlsx, .xls, .xlsm) to JSON
// (`await` must run inside an async function or an ES module)
const result = await ConvertToMarkdown.excelToJson('data.xlsx');
// result.content is a JSON string, so parse it before use
const data = JSON.parse(result.content);
console.log(data);
API Reference
Main Class: ConvertToMarkdown
The package exports a main class with static methods for each conversion type:
Excel (.xlsx, .xls, .xlsm) Conversions
ConvertToMarkdown.excelToJson(input, options)
Converts Excel files (.xlsx, .xls, .xlsm) to JSON format with resolved formulas.
// Needed only for the buffer example below
const fs = require('fs');

// Each call below is independent; the results use distinct names so the
// whole snippet can be pasted into one scope without redeclaration errors.

// From file path
const fromPath = await ConvertToMarkdown.excelToJson('sales-report.xlsx');

// From buffer (the original file name is supplied through options)
const buffer = fs.readFileSync('sales-report.xlsx');
const fromBuffer = await ConvertToMarkdown.excelToJson(buffer, {
  filename: 'sales-report.xlsx'
});

// With sheet filtering
const filtered = await ConvertToMarkdown.excelToJson('data.xlsx', {
  sheetPrefix: 'Sales' // Only process sheets starting with "Sales"
});

// Access the data (content is a JSON string)
const jsonData = JSON.parse(filtered.content);
console.log(filtered.statistics); // Conversion statistics
ConvertToMarkdown.excelToMarkdown(input, options)
Converts Excel spreadsheets (.xlsx, .xls, .xlsm) to Markdown tables.
// Pass a file path; the Markdown text is read from `result.document`.
// Note the statistics property is `stats` here, while the excelToJson
// example above reads `statistics`.
const result = await ConvertToMarkdown.excelToMarkdown('data.xlsx');
console.log(result.document); // Markdown with tables
console.log(result.stats); // Statistics
PDF Conversions
ConvertToMarkdown.pdfToMarkdown(input, options)
Extracts text from PDFs and formats it as Markdown.
// Pass the path of the PDF; the Markdown text is read from `result.document`
const result = await ConvertToMarkdown.pdfToMarkdown('document.pdf');
console.log(result.document); // Markdown content
console.log(result.stats.numberOfPages); // Page count
Word (.docx, .dotx, .dotm) Conversions
ConvertToMarkdown.wordToHtml(input, options)
Converts Word (.docx, .dotx, .dotm) documents to clean, semantic HTML.
// Pass the path of the Word document; the markup is read from `result.html`
const result = await ConvertToMarkdown.wordToHtml('report.docx');
console.log(result.html); // Clean HTML
ConvertToMarkdown.wordToMarkdown(input, options)
Converts Word (.docx, .dotx, .dotm) documents to Markdown with full formatting.
// Pass the path of the Word document; the output is read from
// `result.markdown` and the counters from `result.statistics`
const result = await ConvertToMarkdown.wordToMarkdown('guide.docx');
console.log(result.markdown); // Markdown content
console.log(result.statistics.numberOfHeadings); // Heading count
Direct Function Access
For more control, you can access converter functions directly:
// Named exports group the converter functions by document type
const { excel, pdf, word } = require('@knowcode/convert-to-markdown');
// Direct function calls (buffer input only)
// NOTE(review): `fs` is not required in this snippet — add
// `const fs = require('fs');` when copying it on its own
const buffer = fs.readFileSync('data.xlsx');
const result = await excel.toJson(buffer, { filename: 'data.xlsx' });
Utility Functions
Access helpful utilities for custom processing:
// `utils` is a named export that bundles the helper functions used below
const { utils } = require('@knowcode/convert-to-markdown');
// Estimate tokens for LLM usage
const tokens = utils.estimateTokens('Your text here');
// Convert data to markdown table (the input here is an array of plain objects)
const table = utils.dataToMarkdownTable([
{ name: 'John', age: 30 },
{ name: 'Jane', age: 25 }
]);
// Clean text for markdown
const cleaned = utils.cleanTextForMarkdown('Text with "smart quotes"');
Complete Examples
Example 1: Batch Convert Excel Files (.xlsx, .xls, .xlsm)
const ConvertToMarkdown = require('@knowcode/convert-to-markdown');
const fs = require('fs').promises;
const path = require('path');
/**
 * Convert every Excel workbook in a directory to a Markdown file beside it.
 *
 * Files are selected by extension (.xlsx, .xls, .xlsm, case-insensitive) and
 * converted one at a time. Each output is written into the same directory,
 * named after the workbook with its extension replaced by `.md`. Workbooks
 * that share a base name (e.g. `a.xlsx` and `a.xls`) therefore write to the
 * same output file, the later one overwriting the earlier.
 *
 * Expects `fs` to be the promise-based API (`require('fs').promises`).
 *
 * @param {string} directory - Directory to scan (not recursive).
 * @returns {Promise<void>} Resolves once every matching file has been written.
 * @throws Propagates any error raised while listing, converting or writing.
 */
async function batchConvertExcel(directory) {
  const excelExtensions = new Set(['.xlsx', '.xls', '.xlsm']);

  const files = await fs.readdir(directory);
  const excelFiles = files.filter((file) =>
    excelExtensions.has(path.extname(file).toLowerCase())
  );

  for (const file of excelFiles) {
    const result = await ConvertToMarkdown.excelToMarkdown(
      path.join(directory, file)
    );

    // Replace only the trailing extension of the file name. A plain
    // string replace on the full path would hit the first ".xlsx" it
    // finds, which may be inside a directory name.
    const baseName = path.basename(file, path.extname(file));
    const mdPath = path.join(directory, `${baseName}.md`);
    await fs.writeFile(mdPath, result.document);

    console.log(`Converted ${file}: ${result.stats.estimatedTokens} tokens`);
  }
}
// Handle the rejection explicitly: a floating promise would surface any
// failure as an unhandled rejection.
batchConvertExcel('./spreadsheets').catch((error) => {
  console.error('Batch conversion failed:', error.message);
  process.exitCode = 1;
});
Example 2: Process PDF with Error Handling
/**
 * Convert a PDF to Markdown, warning when the output is large.
 *
 * Any error raised inside the function is logged and then rethrown, so the
 * caller still has to handle the rejection.
 *
 * @param {string} pdfPath - Path of the PDF to convert.
 * @returns {Promise<string>} The Markdown text (`document` of the result).
 */
async function processPDF(pdfPath) {
  try {
    const converted = await ConvertToMarkdown.pdfToMarkdown(pdfPath);

    // Warn (but still return the content) above 10000 estimated tokens
    const isVeryLarge = converted.stats.estimatedTokens > 10000;
    if (isVeryLarge) {
      console.warn(
        'Document is very large:',
        converted.stats.estimatedTokens,
        'tokens'
      );
    }

    return converted.document;
  } catch (failure) {
    console.error('PDF conversion failed:', failure.message);
    throw failure;
  }
}
Example 3: Convert and Analyze Word (.docx, .dotx, .dotm) Documents
/**
 * Convert a Word document to Markdown and print a short structural summary.
 *
 * Logs the heading, paragraph, list and link counters from the result's
 * `statistics`, plus a reading-time figure computed as
 * ceil(estimatedTokens / 200).
 *
 * @param {string} docPath - Path of the Word document to convert.
 * @returns {Promise<string>} The Markdown text (`markdown` of the result).
 */
async function analyzeWordDoc(docPath) {
  const converted = await ConvertToMarkdown.wordToMarkdown(docPath);

  // Label / statistics-key pairs, in the order they are printed
  const counters = [
    ['- Headings:', 'numberOfHeadings'],
    ['- Paragraphs:', 'numberOfParagraphs'],
    ['- Lists:', 'numberOfLists'],
    ['- Links:', 'numberOfLinks'],
  ];

  console.log('Document Analysis:');
  for (const [label, key] of counters) {
    console.log(label, converted.statistics[key]);
  }

  const minutes = Math.ceil(converted.statistics.estimatedTokens / 200);
  console.log('- Estimated reading time:', minutes, 'minutes');

  return converted.markdown;
}
Example 4: Excel (.xlsx, .xls, .xlsm) Data Processing Pipeline
async function processExcelData(filePath) {
// Convert to JSON
const jsonResult = await ConvertToMarkdown.excelToJson(filePath);
const data = JSON.parse(jsonResult.content);
// Process each sheet
for (const [sheetName, rows] of Object.entries(data)) {
console.log(`Processing ${sheetName}: ${rows.length} rows`);
// Your data processing logic here
const processed = rows.map(row => ({
...row,
processed: true,
timestamp: new Date().toISOString()
}));
// Convert back to markdown for reporting
const { utils } = require('@knowcode/convert-to-markdown');
const markdown = utils.dataToMarkdownTable(processed);
console.log(markdown);
}
}
Example 5: Stream Processing Large Files
const { createReadStream } = require('fs');
/**
 * Read a file through a read stream and convert it with excelToJson.
 *
 * The stream is drained completely and its chunks are concatenated into a
 * single Buffer before conversion, so the whole file is held in memory.
 *
 * @param {string} filePath - Path of the Excel file to convert.
 * @returns {Promise<object>} Whatever ConvertToMarkdown.excelToJson resolves to.
 */
async function streamConvert(filePath) {
  // Collect every chunk the stream yields
  const pieces = [];
  for await (const piece of createReadStream(filePath)) {
    pieces.push(piece);
  }

  // The converter receives a buffer, so the file name is passed via options
  const wholeFile = Buffer.concat(pieces);
  const options = { filename: path.basename(filePath) };

  return await ConvertToMarkdown.excelToJson(wholeFile, options);
}
TypeScript Usage
The package includes TypeScript definitions:
import ConvertToMarkdown, {
ExcelJsonResult,
ConversionOptions
} from '@knowcode/convert-to-markdown';
/**
 * Typed example: convert a workbook to JSON and print its estimated token
 * count, using the option and result types shipped with the package.
 *
 * @param filePath - Path of the Excel file to convert.
 */
async function convertWithTypes(filePath: string): Promise<void> {
  const options: ConversionOptions = { filename: 'report.xlsx', sheetPrefix: 'Data' };

  const result: ExcelJsonResult = await ConvertToMarkdown.excelToJson(filePath, options);
  const tokenEstimate = result.statistics.estimatedTokens;

  console.log(tokenEstimate);
}
Performance Tips
- Buffer vs File Path: Using buffers is slightly faster if you already have the file in memory
- Large Files: The package handles files up to 5MB efficiently
- Memory Usage: Each conversion loads the entire file into memory
- Token Estimation: Use the `estimatedTokens` statistic to gauge LLM API costs
Error Handling
All methods return promises that may reject with errors:
try {
  const result = await ConvertToMarkdown.excelToJson('data.xlsx');
} catch (error) {
  // Node.js file-system errors carry a stable `code`; check it first and
  // keep the message match as a fallback in case the error was re-wrapped.
  if (error.code === 'ENOENT' || error.message.includes('ENOENT')) {
    console.error('File not found');
  } else if (error.message.includes('File size too large')) {
    console.error('File exceeds 5MB limit');
  } else {
    console.error('Conversion failed:', error.message);
  }
}
Common Use Cases
1. Preparing Data for LLMs
/**
 * Convert a Word document to Markdown and split it into LLM-sized chunks.
 *
 * Documents whose estimated token count is at most 4000 are returned as a
 * single chunk. Larger documents are split on line boundaries so that each
 * chunk stays within 3000 estimated tokens; a single line that alone exceeds
 * the limit is kept whole as its own chunk. Joining the returned chunks with
 * "\n" reproduces the original Markdown exactly.
 *
 * Requires `utils` from the package to be in scope, e.g.
 * `const { utils } = require('@knowcode/convert-to-markdown');`.
 *
 * @param {string} docPath - Path of the Word document to convert.
 * @returns {Promise<string[]>} The Markdown, as one or more chunks in order.
 */
async function prepareForLLM(docPath) {
  const maxDocumentTokens = 4000;
  const maxChunkTokens = 3000;

  const result = await ConvertToMarkdown.wordToMarkdown(docPath);
  if (result.statistics.estimatedTokens <= maxDocumentTokens) {
    return [result.markdown];
  }

  // Document too large, need to chunk
  const chunks = [];
  let currentChunk = null; // null = no line collected yet (avoids a leading "\n")

  for (const line of result.markdown.split('\n')) {
    if (currentChunk === null) {
      currentChunk = line;
      continue;
    }

    // Measure the chunk exactly as it would be stored, newline included
    const candidate = `${currentChunk}\n${line}`;
    if (utils.estimateTokens(candidate) > maxChunkTokens) {
      chunks.push(currentChunk);
      currentChunk = line;
    } else {
      currentChunk = candidate;
    }
  }

  // Flush the final chunk; without this the tail of the document is lost
  if (currentChunk !== null) {
    chunks.push(currentChunk);
  }

  return chunks;
}
2. Data Analysis from Excel (.xlsx, .xls, .xlsm)
/**
 * Convert a workbook to JSON and compute simple per-sheet statistics.
 *
 * For every sheet the result holds:
 *  - rowCount: number of rows,
 *  - columnCount: number of keys in the first row (0 for an empty sheet),
 *  - hasEmptyRows: true when some row has only blank cells. A cell is blank
 *    when it is null, undefined or the empty string; 0 and false are real
 *    values and do not make a row empty.
 *
 * @param {string} filePath - Path of the Excel file to analyze.
 * @returns {Promise<Object<string, {rowCount: number, columnCount: number, hasEmptyRows: boolean}>>}
 *   Statistics keyed by sheet name.
 */
async function analyzeExcelData(filePath) {
  const result = await ConvertToMarkdown.excelToJson(filePath);
  const data = JSON.parse(result.content);

  // `== null` matches both null and undefined
  const isBlank = (value) => value == null || value === '';

  // Calculate statistics per sheet
  const analysis = {};
  for (const [sheet, rows] of Object.entries(data)) {
    const [firstRow] = rows;
    analysis[sheet] = {
      rowCount: rows.length,
      columnCount: firstRow ? Object.keys(firstRow).length : 0,
      hasEmptyRows: rows.some((row) => Object.values(row).every(isBlank)),
    };
  }
  return analysis;
}
3. Document Migration
/**
 * Convert every supported document in `sourceDir` to a Markdown file in
 * `targetDir`.
 *
 * Supported extensions (matched case-insensitively): .xlsx, .xls, .xlsm,
 * .pdf, .docx, .dotx, .dotm. Other files are skipped. Each output is named
 * after the source file with its extension replaced by `.md`, so sources
 * that share a base name (e.g. `report.xlsx` and `report.pdf`) write to the
 * same output file, the later one overwriting the earlier.
 *
 * Expects `fs` to be the promise-based API (`require('fs').promises`).
 *
 * @param {string} sourceDir - Directory to read documents from (not recursive).
 * @param {string} targetDir - Directory to write Markdown into; created if missing.
 * @returns {Promise<void>} Resolves once every supported file has been written.
 * @throws Propagates any error raised while listing, converting or writing.
 */
async function migrateDocuments(sourceDir, targetDir) {
  // Each entry pairs a converter with the result property holding its Markdown
  const excel = {
    convert: (source) => ConvertToMarkdown.excelToMarkdown(source),
    field: 'document',
  };
  const pdf = {
    convert: (source) => ConvertToMarkdown.pdfToMarkdown(source),
    field: 'document',
  };
  const word = {
    convert: (source) => ConvertToMarkdown.wordToMarkdown(source),
    field: 'markdown',
  };
  const converters = new Map([
    ['.xlsx', excel],
    ['.xls', excel],
    ['.xlsm', excel],
    ['.pdf', pdf],
    ['.docx', word],
    ['.dotx', word],
    ['.dotm', word],
  ]);

  const files = await fs.readdir(sourceDir);
  await fs.mkdir(targetDir, { recursive: true });

  for (const file of files) {
    const ext = path.extname(file);
    const converter = converters.get(ext.toLowerCase());
    if (!converter) {
      continue;
    }

    const result = await converter.convert(path.join(sourceDir, file));

    // Strip the extension as written in the file name: basename() compares
    // the suffix case-sensitively, so the lower-cased form would leave
    // "REPORT.XLSX" untouched.
    const baseName = path.basename(file, ext);
    await fs.writeFile(
      path.join(targetDir, `${baseName}.md`),
      result[converter.field]
    );
  }
}
Limitations
- Maximum file size: 5MB (for optimal performance)
- PDF conversion: Works best with text-based PDFs (not scanned images)
- Excel (.xlsx, .xls, .xlsm) formulas: Resolved to their calculated values
- Word (.docx, .dotx, .dotm) formatting: Complex layouts may be simplified
Contributing
Issues and pull requests are welcome at the GitHub repository.
License
MIT