forked from sheetjs/docs.sheetjs.com
		
	
		
			
				
	
	
		
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| import { Document } from "@langchain/core/documents";
 | |
| import { BufferLoader } from "langchain/document_loaders/fs/buffer";
 | |
| import { read, utils } from "xlsx";
 | |
| 
 | |
/**
 * Document loader that uses SheetJS to load documents.
 *
 * Each worksheet is parsed into an array of row objects using the SheetJS
 * `sheet_to_json` method, and every row is projected to a `Document`.
 * Document metadata includes the original sheet name, the row index and the
 * row data itself.
 */
export default class LoadOfSheet extends BufferLoader {
  /**
   * Attribute descriptions rebuilt by every call to `parse`: the two fixed
   * entries (`worksheet`, `rowNum`) followed by one entry per column label
   * seen in each worksheet.
   *
   * @type {import("langchain/chains/query_constructor").AttributeInfo[]}
   */
  attributes = [];

  /**
   * Document loader that uses SheetJS to load documents.
   *
   * @param {string|Blob} filePathOrBlob Source Data
   */
  constructor(filePathOrBlob) {
    super(filePathOrBlob);
    this.attributes = [];
  }

  /**
   * Parse document
   *
   * NOTE: column labels in multiple sheets are not disambiguated! A label
   * that appears in several sheets is pushed to `attributes` once per sheet.
   *
   * @param {Buffer} raw Raw data Buffer
   * @param {Document["metadata"]} metadata Document metadata
   * @returns {Promise<Document[]>} Array of Documents (one per data row)
   */
  async parse(raw, metadata) {
    /** @type {Document[]} */
    const result = [];

    this.attributes = [
      { name: "worksheet", description: "Sheet or Worksheet Name", type: "string" },
      { name: "rowNum", description: "Row index", type: "number" },
    ];

    // NOTE(review): `WTF` is the SheetJS debugging flag; with it set, parse
    // problems are expected to surface as thrown errors -- confirm that this
    // is desired outside of development.
    const wb = read(raw, { type: "buffer", WTF: 1 });

    for (const name of wb.SheetNames) {
      const ws = wb.Sheets[name];
      // A listed sheet without data is skipped. Returning here would resolve
      // to `undefined` and drop every Document collected so far.
      if (!ws) continue;

      // Column label -> set of observed value types. Labels come from the
      // spreadsheet, so use a prototype-less dictionary: with a plain `{}`
      // a header such as "__proto__" or "constructor" would resolve to an
      // inherited object and the type flags would be written onto it.
      const fields = Object.create(null);

      const aoo = utils.sheet_to_json(ws);
      aoo.forEach((row, idx) => {
        const lines = Object.entries(row)
          .map(([label, value]) => `- ${label}: ${value}`)
          .join("\n");

        result.push({
          pageContent: `Row ${idx + 1} has the following content: \n${lines}\n`,
          // Later spreads win: caller metadata and then the row's own columns
          // may override `worksheet` / `rowNum` when the names collide.
          metadata: {
            worksheet: name,
            // presumably attached to each row object by `sheet_to_json`
            rowNum: row["__rowNum__"],
            ...metadata,
            ...row,
          },
        });

        for (const [label, value] of Object.entries(row)) {
          if (value == null) continue;
          const kind = value instanceof Date ? "date" : typeof value;
          (fields[label] ??= {})[kind] = true;
        }
      });

      for (const [label, kinds] of Object.entries(fields)) {
        this.attributes.push({
          name: label,
          description: label,
          type: Object.keys(kinds).join(" or "),
        });
      }
    }

    return result;
  }
}
 |