forked from sheetjs/docs.sheetjs.com
		
	
		
			
	
	
		
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
		
		
			
		
	
	
			71 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
| 
								 | 
							
								import { Document } from "@langchain/core/documents";
							 | 
						||
| 
								 | 
							
								import { BufferLoader } from "langchain/document_loaders/fs/buffer";
							 | 
						||
| 
								 | 
							
								import { read, utils } from "xlsx";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/**
							 | 
						||
| 
								 | 
							
								 * Document loader that uses SheetJS to load documents.
							 | 
						||
| 
								 | 
							
								 *
							 | 
						||
| 
								 | 
							
								 * Each worksheet is parsed into an array of row objects using the SheetJS
							 | 
						||
| 
								 | 
							
								 * `sheet_to_json` method and projected to a `Document`. Metadata includes
							 | 
						||
| 
								 | 
							
								 * original sheet name, row data, and row index
							 | 
						||
| 
								 | 
							
								 */
							 | 
						||
| 
								 | 
							
								export default class LoadOfSheet extends BufferLoader {
								  /**
								   * Attribute descriptions (name, description, type) for the metadata keys
								   * produced by the most recent `parse` call. Empty until `parse` runs.
								   *
								   * @type {import("langchain/chains/query_constructor").AttributeInfo[]}
								   */
								  attributes = [];

								  /**
								   * Document loader that uses SheetJS to load documents.
								   *
								   * @param {string|Blob} filePathOrBlob Source Data
								   */
								  constructor(filePathOrBlob) {
								    super(filePathOrBlob);
								    this.attributes = [];
								  }

								  /**
								   * Parse a spreadsheet buffer into one Document per data row.
								   *
								   * Every worksheet listed in the workbook is converted to row objects with
								   * `utils.sheet_to_json`. Each row becomes a document whose `pageContent`
								   * is a bulleted "column: value" listing and whose `metadata` holds the
								   * worksheet name, the row number, the caller-supplied metadata and the
								   * row's own columns. Later entries win on key collisions, so caller
								   * metadata and row columns can override `worksheet` and `rowNum`.
								   *
								   * As a side effect `this.attributes` is rebuilt: the two fixed entries
								   * (`worksheet`, `rowNum`) followed by one entry per column of each sheet,
								   * typed by the value types observed in that column.
								   *
								   * NOTE: column labels in multiple sheets are not disambiguated! A label
								   * that occurs in several sheets yields one attribute entry per sheet.
								   *
								   * @param {Buffer} raw Raw data Buffer
								   * @param {Document["metadata"]} metadata Document metadata
								   * @returns {Promise<Document[]>} Array of Documents
								   */
								  async parse(raw, metadata) {
								    /** @type {Document[]} */
								    const result = [];

								    this.attributes = [
								      { name: "worksheet", description: "Sheet or Worksheet Name", type: "string" },
								      { name: "rowNum", description: "Row index", type: "number" },
								    ];

								    // WTF: 1 is passed through to the SheetJS reader unchanged.
								    const wb = read(raw, { type: "buffer", WTF: 1 });

								    for (const name of wb.SheetNames) {
								      const ws = wb.Sheets[name];
								      // Skip a sheet name with no worksheet object. Returning here would
								      // resolve to `undefined` and drop the documents collected so far.
								      if (!ws) continue;

								      // Column label -> set of observed value types. A null-prototype
								      // dictionary keeps labels such as "constructor" or "toString" from
								      // resolving to inherited Object.prototype members.
								      const fields = Object.create(null);

								      const rows = utils.sheet_to_json(ws);
								      rows.forEach((row, idx) => {
								        const cells = Object.entries(row);
								        const listing = cells.map(([label, value]) => `- ${label}: ${value}`).join("\n");

								        result.push({
								          pageContent: `Row ${idx + 1} has the following content: \n${listing}\n`,
								          metadata: {
								            worksheet: name,
								            rowNum: row["__rowNum__"],
								            ...metadata,
								            ...row,
								          },
								        });

								        for (const [label, value] of cells) {
								          // null / undefined cells carry no type information.
								          if (value == null) continue;
								          const kind = value instanceof Date ? "date" : typeof value;
								          (fields[label] ??= {})[kind] = true;
								        }
								      });

								      for (const [label, kinds] of Object.entries(fields)) {
								        this.attributes.push({
								          name: label,
								          description: label,
								          type: Object.keys(kinds).join(" or "),
								        });
								      }
								    }

								    return result;
								  }
								}
							 |