forked from sheetjs/docs.sheetjs.com
		
	
		
			
	
	
		
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
		
		
			
		
	
	
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
|  | import { Document } from "@langchain/core/documents"; | ||
|  | import { BufferLoader } from "langchain/document_loaders/fs/buffer"; | ||
|  | import { read, utils } from "xlsx"; | ||
|  | 
 | ||
/**
 * Document loader that uses SheetJS to load documents.
 *
 * Each worksheet is parsed into an array of row objects using the SheetJS
 * `sheet_to_json` method and projected to a `Document`. Metadata includes
 * original sheet name, row data, and row index.
 */
export default class LoadOfSheet extends BufferLoader {
  /**
   * Attribute descriptors for the metadata fields produced by the most recent
   * `parse` call. Rebuilt from scratch on every `parse`.
   *
   * @type {import("langchain/chains/query_constructor").AttributeInfo[]}
   */
  attributes = [];

  /**
   * Document loader that uses SheetJS to load documents.
   *
   * @param {string|Blob} filePathOrBlob Source Data
   */
  constructor(filePathOrBlob) {
    super(filePathOrBlob);
    this.attributes = [];
  }

  /**
   * Parse document
   *
   * Produces one Document per data row of every worksheet and, as a side
   * effect, replaces `this.attributes` with descriptors for the fixed
   * `worksheet` / `rowNum` fields plus one descriptor per column seen in each
   * sheet.
   *
   * NOTE: column labels in multiple sheets are not disambiguated! A column
   * label that appears in several sheets yields several attribute entries.
   *
   * @param {Buffer} raw Raw data Buffer
   * @param {Document["metadata"]} metadata Document metadata
   * @returns {Promise<Document[]>} Array of Documents
   */
  async parse(raw, metadata) {
    /** @type {Document[]} */
    const result = [];

    this.attributes = [
      { name: "worksheet", description: "Sheet or Worksheet Name", type: "string" },
      { name: "rowNum", description: "Row index", type: "number" },
    ];

    // NOTE(review): `WTF: 1` is understood to make SheetJS throw on parse
    // problems instead of suppressing them -- confirm against the SheetJS docs.
    const wb = read(raw, { type: "buffer", WTF: 1 });

    for (const name of wb.SheetNames) {
      const ws = wb.Sheets[name];
      // Skip a missing worksheet entry rather than returning: an early
      // `return` here would resolve to `undefined` and discard every Document
      // collected from the preceding sheets.
      if (!ws) continue;

      /** Column label -> set of observed value types (keys of the inner object). */
      const fields = {};

      const aoo = utils.sheet_to_json(ws);
      aoo.forEach((row, idx) => {
        const entries = Object.entries(row);
        const lines = entries.map(([k, v]) => `- ${k}: ${v}`).join("\n");

        result.push({
          pageContent: `Row ${idx + 1} has the following content: \n${lines}\n`,
          metadata: {
            worksheet: name,
            // NOTE(review): `__rowNum__` is presumably the source row index
            // attached by `sheet_to_json` -- confirm against the SheetJS docs.
            rowNum: row["__rowNum__"],
            // Spread order matters: caller metadata, then row columns, override
            // the two fields above when keys collide.
            ...metadata,
            ...row,
          },
        });

        // Record every value type seen per column; null/undefined cells are ignored.
        for (const [k, v] of entries) {
          if (v == null) continue;
          const kind = v instanceof Date ? "date" : typeof v;
          (fields[k] || (fields[k] = {}))[kind] = true;
        }
      });

      for (const [k, v] of Object.entries(fields)) {
        this.attributes.push({
          name: k,
          description: k,
          // Columns with mixed content are described as e.g. "string or number".
          type: Object.keys(v).join(" or "),
        });
      }
    }

    return result;
  }
}