forked from sheetjs/docs.sheetjs.com
		
	
		
			
				
	
	
		
			332 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			332 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| ---
 | |
| title: Typed Arrays and ML
 | |
| pagination_prev: demos/extensions/index
 | |
| pagination_next: demos/engines/index
 | |
| sidebar_custom_props:
 | |
|   summary: Parse and serialize Uint8Array data from TensorFlow
 | |
| ---
 | |
| 
 | |
| <head>
 | |
|   <script src="https://docs.sheetjs.com/tfjs/tf.min.js"></script>
 | |
| </head>
 | |
| 
 | |
| Machine learning libraries in JS typically use "Typed Arrays". Typed Arrays are
 | |
| not JS Arrays! With some data wrangling, translating between SheetJS worksheets
 | |
| and typed arrays is straightforward.
 | |
| 
 | |
| This demo covers conversions between worksheets and Typed Arrays for use with
 | |
| TensorFlow.js and other ML libraries.
 | |
| 
 | |
| :::note
 | |
| 
 | |
| Live code blocks in this page load the standalone build from version `3.18.0`.
 | |
| 
 | |
| For use in web frameworks, the `@tensorflow/tfjs` module should be used.
 | |
| 
 | |
| For use in NodeJS, the native bindings module is `@tensorflow/tfjs-node`.
 | |
| 
 | |
| :::
 | |
| 
 | |
| ## CSV Data Interchange
 | |
| 
 | |
| `tf.data.csv` generates a Dataset from CSV data.  The function expects a URL.
 | |
| Fortunately blob URLs are supported, making data import straightforward:
 | |
| 
 | |
| ```js
 | |
| function worksheet_to_csv_url(worksheet) {
 | |
|   /* generate CSV */
 | |
|   const csv = XLSX.utils.sheet_to_csv(worksheet);
 | |
| 
 | |
|   /* CSV -> Uint8Array -> Blob */
 | |
|   const u8 = new TextEncoder().encode(csv);
 | |
|   const blob = new Blob([u8], { type: "text/csv" });
 | |
| 
 | |
|   /* generate a blob URL */
 | |
|   return URL.createObjectURL(blob);
 | |
| }
 | |
| ```
 | |
| 
 | |
| [This demo mirrors `TFjs` docs](https://js.tensorflow.org/api/latest/#data.csv),
 | |
| fetching [an XLSX export of the example dataset](https://sheetjs.com/data/bht.xlsx).
 | |
| 
 | |
| <details><summary><b>TF CSV Demo using XLSX files</b> (click to show)</summary>
 | |
| 
 | |
| :::caution
 | |
| 
 | |
| If the live demo shows a message
 | |
| 
 | |
| ```
 | |
| ReferenceError: tf is not defined
 | |
| ```
 | |
| 
 | |
| please refresh the page.  This is a known bug in the documentation generator.
 | |
| 
 | |
| :::
 | |
| 
 | |
| ```jsx live
 | |
| function SheetJSToTFJSCSV() {
 | |
|   const [output, setOutput] = React.useState("");
 | |
|   const doit = React.useCallback(async () => {
 | |
|     /* fetch file */
 | |
|     const f = await fetch("https://sheetjs.com/data/bht.xlsx");
 | |
|     const ab = await f.arrayBuffer();
 | |
|     /* parse file and get first worksheet */
 | |
|     const wb = XLSX.read(ab);
 | |
|     const ws = wb.Sheets[wb.SheetNames[0]];
 | |
| 
 | |
|     /* generate CSV */
 | |
|     const csv = XLSX.utils.sheet_to_csv(ws);
 | |
| 
 | |
|     /* generate blob URL */
 | |
|     const u8 = new TextEncoder().encode(csv);
 | |
|     const blob = new Blob([u8], {type: "text/csv"});
 | |
|     const url = URL.createObjectURL(blob);
 | |
| 
 | |
|     /* feed to tfjs */
 | |
|     const dataset = tf.data.csv(url, {columnConfigs:{"medv":{isLabel:true}}});
 | |
| 
 | |
|     /* this part mirrors the tf.data.csv docs */
 | |
|     const flat = dataset.map(({xs,ys}) => ({xs: Object.values(xs), ys: Object.values(ys)})).batch(10);
 | |
|     const model = tf.sequential();
 | |
|     model.add(tf.layers.dense({inputShape: [(await dataset.columnNames()).length - 1], units: 1}));
 | |
|     model.compile({ optimizer: tf.train.sgd(0.000001), loss: 'meanSquaredError' });
 | |
|     let base = output;
 | |
|     await model.fitDataset(flat, { epochs: 10, callbacks: { onEpochEnd: async (epoch, logs) => {
 | |
|       setOutput(base += "\n" + epoch + ":" + logs.loss);
 | |
|     }}});
 | |
|     model.summary();
 | |
|   });
 | |
|   return ( <pre><b><a href="https://js.tensorflow.org/api/latest/#data.csv">Original CSV demo</a></b><br/><br/>
 | |
|     <button onClick={doit}>Click to run</button>
 | |
|     {output}
 | |
|   </pre> );
 | |
| }
 | |
| ```
 | |
| 
 | |
| </details>
 | |
| 
 | |
| In the other direction, `XLSX.read` will readily parse CSV exports.
 | |
| 
 | |
| ## JS Array Interchange
 | |
| 
 | |
| [The official Linear Regression tutorial](https://www.tensorflow.org/js/tutorials/training/linear_regression)
 | |
| loads data from a JSON file:
 | |
| 
 | |
| ```json
 | |
| [
 | |
|   {
 | |
|     "Name": "chevrolet chevelle malibu",
 | |
|     "Miles_per_Gallon": 18,
 | |
|     "Cylinders": 8,
 | |
|     "Displacement": 307,
 | |
|     "Horsepower": 130,
 | |
|     "Weight_in_lbs": 3504,
 | |
|     "Acceleration": 12,
 | |
|     "Year": "1970-01-01",
 | |
|     "Origin": "USA"
 | |
|   },
 | |
|   {
 | |
|     "Name": "buick skylark 320",
 | |
|     "Miles_per_Gallon": 15,
 | |
|     "Cylinders": 8,
 | |
|     "Displacement": 350,
 | |
|     "Horsepower": 165,
 | |
|     "Weight_in_lbs": 3693,
 | |
|     "Acceleration": 11.5,
 | |
|     "Year": "1970-01-01",
 | |
|     "Origin": "USA"
 | |
|   },
 | |
|   // ...
 | |
| ]
 | |
| ```
 | |
| 
 | |
| In real use cases, data is stored in [spreadsheets](https://sheetjs.com/data/cd.xls)
 | |
| 
 | |
| 
 | |
| 
 | |
| Following the tutorial, the data fetching method is easily adapted. Differences
 | |
| from the official example are highlighted below:
 | |
| 
 | |
| ```js
 | |
| /**
 | |
|  * Get the car data reduced to just the variables we are interested
 | |
|  * and cleaned of missing data.
 | |
|  */
 | |
| async function getData() {
 | |
|   // highlight-start
 | |
|   /* fetch file */
 | |
|   const carsDataResponse = await fetch('https://sheetjs.com/data/cd.xls');
 | |
|   /* get file data (ArrayBuffer) */
 | |
|   const carsDataAB = await carsDataResponse.arrayBuffer();
 | |
|   /* parse */
 | |
|   const carsDataWB = XLSX.read(carsDataAB);
 | |
|   /* get first worksheet */
 | |
|   const carsDataWS = carsDataWB.Sheets[carsDataWB.SheetNames[0]];
 | |
|   /* generate array of JS objects */
 | |
|   const carsData = XLSX.utils.sheet_to_json(carsDataWS);
 | |
|   // highlight-end
 | |
|   const cleaned = carsData.map(car => ({
 | |
|     mpg: car.Miles_per_Gallon,
 | |
|     horsepower: car.Horsepower,
 | |
|   }))
 | |
|   .filter(car => (car.mpg != null && car.horsepower != null));
 | |
| 
 | |
|   return cleaned;
 | |
| }
 | |
| ```
 | |
| 
 | |
| ## Low-Level Operations
 | |
| 
 | |
| :::caution
 | |
| 
 | |
| While it is more efficient to use low-level operations, JS or CSV interchange
 | |
| is strongly recommended when possible.
 | |
| 
 | |
| :::
 | |
| 
 | |
| ### Data Transposition
 | |
| 
 | |
| A typical dataset in a spreadsheet will start with one header row and represent
 | |
| each data record in its own row.  For example, the Iris dataset might look like
 | |
| 
 | |
| 
 | |
| 
 | |
| `XLSX.utils.sheet_to_json` will translate this into an array of row objects:
 | |
| 
 | |
| ```js
 | |
| var aoo = [
 | |
|   {"sepal length": 5.1, "sepal width": 3.5, ...},
 | |
|   {"sepal length": 4.9, "sepal width":   3, ...},
 | |
|   ...
 | |
| ];
 | |
| ```
 | |
| 
 | |
| TF.js and other libraries tend to operate on individual columns, equivalent to:
 | |
| 
 | |
| ```js
 | |
| var sepal_lengths = [5.1, 4.9, ...];
 | |
| var sepal_widths = [3.5, 3, ...];
 | |
| ```
 | |
| 
 | |
| When a `tensor2d` can be exported, it will look different from the spreadsheet:
 | |
| 
 | |
| ```js
 | |
| var data_set_2d = [
 | |
|   [5.1, 4.9, ...],
 | |
|   [3.5, 3, ...],
 | |
|   ...
 | |
| ]
 | |
| ```
 | |
| 
 | |
| This is the transpose of how people use spreadsheets!
 | |
| 
 | |
| #### Typed Arrays and Columns
 | |
| 
 | |
| A single typed array can be converted to a pure JS array with `Array.from`:
 | |
| 
 | |
| ```js
 | |
| var column = Array.from(dataset_typedarray);
 | |
| ```
 | |
| 
 | |
| Similarly, `Float32Array.from` generates a typed array from a normal array:
 | |
| 
 | |
| ```js
 | |
| var dataset = Float32Array.from(column);
 | |
| ```
 | |
| 
 | |
| ### Exporting Datasets to a Worksheet
 | |
| 
 | |
| `XLSX.utils.aoa_to_sheet` can generate a worksheet from an array of arrays.
 | |
| ML libraries typically provide APIs to pull an array of arrays, but it will
 | |
| be transposed. To export multiple data sets, manually "transpose" the data:
 | |
| 
 | |
| ```js
 | |
| /* assuming data is an array of typed arrays */
 | |
| var aoa = [];
 | |
| for(var i = 0; i < data.length; ++i) {
 | |
|   for(var j = 0; j < data[i].length; ++j) {
 | |
|     if(!aoa[j]) aoa[j] = [];
 | |
|     aoa[j][i] = data[i][j];
 | |
|   }
 | |
| }
 | |
| /* aoa can be directly converted to a worksheet object */
 | |
| var ws = XLSX.utils.aoa_to_sheet(aoa);
 | |
| ```
 | |
| 
 | |
| ### Importing Data from a Spreadsheet
 | |
| 
 | |
| `sheet_to_json` with the option `header:1` will generate a row-major array of
 | |
| arrays that can be transposed.  However, it is more efficient to walk the sheet
 | |
| manually:
 | |
| 
 | |
| ```js
 | |
| /* find worksheet range */
 | |
| var range = XLSX.utils.decode_range(ws['!ref']);
 | |
| var out = []
 | |
| /* walk the columns */
 | |
| for(var C = range.s.c; C <= range.e.c; ++C) {
 | |
|   /* create the typed array */
 | |
|   var ta = new Float32Array(range.e.r - range.s.r + 1);
 | |
|   /* walk the rows */
 | |
|   for(var R = range.s.r; R <= range.e.r; ++R) {
 | |
|     /* find the cell, skip it if the cell isn't numeric or boolean */
 | |
|     var cell = ws[XLSX.utils.encode_cell({r:R, c:C})];
 | |
|     if(!cell || cell.t != 'n' && cell.t != 'b') continue;
 | |
|     /* assign to the typed array */
 | |
|     ta[R - range.s.r] = cell.v;
 | |
|   }
 | |
|   out.push(ta);
 | |
| }
 | |
| ```
 | |
| 
 | |
| If the data set has a header row, the loop can be adjusted to skip those rows.
 | |
| 
 | |
| ### TF.js Tensors
 | |
| 
 | |
| A single `Array#map` can pull individual named fields from the result, which
 | |
| can be used to construct TensorFlow.js tensor objects:
 | |
| 
 | |
| ```js
 | |
| const aoo = XLSX.utils.sheet_to_json(worksheet);
 | |
| const lengths = aoo.map(row => row["sepal length"]);
 | |
| const tensor = tf.tensor1d(lengths);
 | |
| ```
 | |
| 
 | |
| `tf.Tensor` objects can be directly transposed using `transpose`:
 | |
| 
 | |
| ```js
 | |
| var aoo = XLSX.utils.sheet_to_json(worksheet);
 | |
| // "x" and "y" are the fields we want to pull from the data
 | |
| var data = aoo.map(row => ([row["x"], row["y"]]));
 | |
| 
 | |
| // create a tensor representing two column datasets
 | |
| var tensor = tf.tensor2d(data).transpose();
 | |
| 
 | |
| // individual columns can be accessed
 | |
| var col1 = tensor.slice([0,0], [1,tensor.shape[1]]).flatten();
 | |
| var col2 = tensor.slice([1,0], [1,tensor.shape[1]]).flatten();
 | |
| ```
 | |
| 
 | |
| For exporting, `stack` can be used to collapse the columns into a linear array:
 | |
| 
 | |
| ```js
 | |
| /* pull data into a Float32Array */
 | |
| var result = tf.stack([col1, col2]).transpose();
 | |
| var shape = tensor.shape;
 | |
| var f32 = tensor.dataSync();
 | |
| 
 | |
| /* construct an array of arrays of the data in spreadsheet order */
 | |
| var aoa = [];
 | |
| for(var j = 0; j < shape[0]; ++j) {
 | |
|   aoa[j] = [];
 | |
|   for(var i = 0; i < shape[1]; ++i) aoa[j][i] = f32[j * shape[1] + i];
 | |
| }
 | |
| 
 | |
| /* add headers to the top */
 | |
| aoa.unshift(["x", "y"]);
 | |
| 
 | |
| /* generate worksheet */
 | |
| var worksheet = XLSX.utils.aoa_to_sheet(aoa);
 | |
| ```
 | |
| 
 |