| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  | --- | 
					
						
							|  |  |  | sidebar_position: 8 | 
					
						
							|  |  |  | title: Typed Arrays and ML | 
					
						
							|  |  |  | --- | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | <head> | 
					
						
							|  |  |  |   <script src="https://unpkg.com/@tensorflow/tfjs@3.18.0/dist/tf.min.js"></script> | 
					
						
							|  |  |  | </head> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Machine learning libraries in JS typically use "Typed Arrays". Typed Arrays are | 
					
						
							|  |  |  | not JS Arrays! SheetJS expects bona fide JS Arrays. With some data wrangling, | 
					
						
							|  |  |  | translating between SheetJS worksheets and typed arrays is straightforward. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This demo covers conversions between worksheets and Typed Arrays for use with | 
					
						
							|  |  |  | [TensorFlow.js](https://js.tensorflow.org/js/) and other ML libraries. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :::note | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The live code blocks in this demo load the standalone TensorFlow.js build: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```html | 
					
						
							|  |  |  | <script src="https://unpkg.com/@tensorflow/tfjs@3.18.0/dist/tf.min.js"></script> | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ::: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ## CSV Data Interchange | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | `tf.data.csv` generates a Dataset from CSV data.  The function expects a URL. | 
					
						
							|  |  |  | Fortunately blob URLs are supported, making data import straightforward: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | function worksheet_to_csv_url(worksheet) { | 
					
						
							|  |  |  |   /* generate CSV */ | 
					
						
							|  |  |  |   const csv = XLSX.utils.sheet_to_csv(worksheet); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   /* CSV -> Uint8Array -> Blob */ | 
					
						
							|  |  |  |   const u8 = new TextEncoder().encode(csv); | 
					
						
							|  |  |  |   const blob = new Blob([u8], { type: "text/csv" }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   /* generate a blob URL */ | 
					
						
							|  |  |  |   return URL.createObjectURL(blob); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | [This demo mirrors TFjs docs](https://js.tensorflow.org/api/latest/#data.csv), | 
					
						
							| 
									
										
										
										
											2022-08-21 19:43:30 +00:00
										 |  |  | fetching [an XLSX export of the example dataset](https://sheetjs.com/data/bht.xlsx). | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | <details><summary><b>TF CSV Demo using XLSX files</b> (click to show)</summary> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```jsx live | 
					
						
							|  |  |  | function SheetJSToTFJSCSV() { | 
					
						
							|  |  |  |   const [output, setOutput] = React.useState(""); | 
					
						
							|  |  |  |   const doit = React.useCallback(async () => { | 
					
						
							|  |  |  |     /* fetch file */ | 
					
						
							| 
									
										
										
										
											2022-08-21 19:43:30 +00:00
										 |  |  |     const f = await fetch("https://sheetjs.com/data/bht.xlsx"); | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  |     const ab = await f.arrayBuffer(); | 
					
						
							|  |  |  |     /* parse file and get first worksheet */ | 
					
						
							|  |  |  |     const wb = XLSX.read(ab); | 
					
						
							|  |  |  |     const ws = wb.Sheets[wb.SheetNames[0]]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* generate CSV */ | 
					
						
							|  |  |  |     const csv = XLSX.utils.sheet_to_csv(ws); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* generate blob URL */ | 
					
						
							|  |  |  |     const u8 = new TextEncoder().encode(csv); | 
					
						
							|  |  |  |     const blob = new Blob([u8], {type: "text/csv"}); | 
					
						
							|  |  |  |     const url = URL.createObjectURL(blob); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* feed to tfjs */ | 
					
						
							|  |  |  |     const dataset = tf.data.csv(url, {columnConfigs:{"medv":{isLabel:true}}}); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* this part mirrors the tf.data.csv docs */ | 
					
						
							|  |  |  |     const flat = dataset.map(({xs,ys}) => ({xs: Object.values(xs), ys: Object.values(ys)})).batch(10); | 
					
						
							|  |  |  |     const model = tf.sequential(); | 
					
						
							|  |  |  |     model.add(tf.layers.dense({inputShape: [(await dataset.columnNames()).length - 1], units: 1})); | 
					
						
							|  |  |  |     model.compile({ optimizer: tf.train.sgd(0.000001), loss: 'meanSquaredError' }); | 
					
						
							|  |  |  |     let base = output; | 
					
						
							|  |  |  |     await model.fitDataset(flat, { epochs: 10, callbacks: { onEpochEnd: async (epoch, logs) => { | 
					
						
							| 
									
										
										
										
											2022-07-25 20:48:10 +00:00
										 |  |  |       setOutput(base += "\n" + epoch + ":" + logs.loss); | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  |     }}}); | 
					
						
							|  |  |  |     model.summary(); | 
					
						
							|  |  |  |   }); | 
					
						
							|  |  |  |   return ( <pre><b><a href="https://js.tensorflow.org/api/latest/#data.csv">Original CSV demo</a></b><br/><br/> | 
					
						
							|  |  |  |     <button onClick={doit}>Click to run</button> | 
					
						
							|  |  |  |     {output} | 
					
						
							|  |  |  |   </pre> ); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | </details> | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | In the other direction, `XLSX.read` will readily parse CSV exports. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ## JS Array Interchange | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | [The official Linear Regression tutorial](https://www.tensorflow.org/js/tutorials/training/linear_regression) | 
					
						
							|  |  |  | loads data from a JSON file: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```json | 
					
						
							|  |  |  | [ | 
					
						
							|  |  |  |   { | 
					
						
							|  |  |  |     "Name": "chevrolet chevelle malibu", | 
					
						
							|  |  |  |     "Miles_per_Gallon": 18, | 
					
						
							|  |  |  |     "Cylinders": 8, | 
					
						
							|  |  |  |     "Displacement": 307, | 
					
						
							|  |  |  |     "Horsepower": 130, | 
					
						
							|  |  |  |     "Weight_in_lbs": 3504, | 
					
						
							|  |  |  |     "Acceleration": 12, | 
					
						
							|  |  |  |     "Year": "1970-01-01", | 
					
						
							|  |  |  |     "Origin": "USA" | 
					
						
							|  |  |  |   }, | 
					
						
							|  |  |  |   { | 
					
						
							|  |  |  |     "Name": "buick skylark 320", | 
					
						
							|  |  |  |     "Miles_per_Gallon": 15, | 
					
						
							|  |  |  |     "Cylinders": 8, | 
					
						
							|  |  |  |     "Displacement": 350, | 
					
						
							|  |  |  |     "Horsepower": 165, | 
					
						
							|  |  |  |     "Weight_in_lbs": 3693, | 
					
						
							|  |  |  |     "Acceleration": 11.5, | 
					
						
							|  |  |  |     "Year": "1970-01-01", | 
					
						
							|  |  |  |     "Origin": "USA" | 
					
						
							|  |  |  |   }, | 
					
						
							|  |  |  |   // ... | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-08-21 19:43:30 +00:00
										 |  |  | In real use cases, data is stored in [spreadsheets](https://sheetjs.com/data/cd.xls) | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Following the tutorial, the data fetching method is easily adapted. Differences | 
					
						
							|  |  |  | from the official example are highlighted below: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | /** | 
					
						
							|  |  |  |  * Get the car data reduced to just the variables we are interested | 
					
						
							|  |  |  |  * and cleaned of missing data. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | async function getData() { | 
					
						
							|  |  |  |   // highlight-start | 
					
						
							|  |  |  |   /* fetch file */ | 
					
						
							| 
									
										
										
										
											2022-08-21 19:43:30 +00:00
										 |  |  |   const carsDataResponse = await fetch('https://sheetjs.com/data/cd.xls'); | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  |   /* get file data (ArrayBuffer) */ | 
					
						
							|  |  |  |   const carsDataAB = await carsDataResponse.arrayBuffer(); | 
					
						
							|  |  |  |   /* parse */ | 
					
						
							|  |  |  |   const carsDataWB = XLSX.read(carsDataAB); | 
					
						
							|  |  |  |   /* get first worksheet */ | 
					
						
							|  |  |  |   const carsDataWS = carsDataWB.Sheets[carsDataWB.SheetNames[0]]; | 
					
						
							|  |  |  |   /* generate array of JS objects */ | 
					
						
							|  |  |  |   const carsData = XLSX.utils.sheet_to_json(carsDataWS); | 
					
						
							|  |  |  |   // highlight-end | 
					
						
							|  |  |  |   const cleaned = carsData.map(car => ({ | 
					
						
							|  |  |  |     mpg: car.Miles_per_Gallon, | 
					
						
							|  |  |  |     horsepower: car.Horsepower, | 
					
						
							|  |  |  |   })) | 
					
						
							|  |  |  |   .filter(car => (car.mpg != null && car.horsepower != null)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   return cleaned; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ## Low-Level Operations | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :::caution | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | While it is more efficient to use low-level operations, JS or CSV interchange | 
					
						
							| 
									
										
										
										
											2022-07-25 20:48:10 +00:00
										 |  |  | is strongly recommended when possible. | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ::: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Data Transposition | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A typical dataset in a spreadsheet will start with one header row and represent | 
					
						
							|  |  |  | each data record in its own row.  For example, the Iris dataset might look like | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | `XLSX.utils.sheet_to_json` will translate this into an array of row objects: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var aoo = [ | 
					
						
							|  |  |  |   {"sepal length": 5.1, "sepal width": 3.5, ...}, | 
					
						
							|  |  |  |   {"sepal length": 4.9, "sepal width":   3, ...}, | 
					
						
							|  |  |  |   ... | 
					
						
							|  |  |  | ]; | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TF.js and other libraries tend to operate on individual columns, equivalent to: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var sepal_lengths = [5.1, 4.9, ...]; | 
					
						
							|  |  |  | var sepal_widths  = [3.5, 3, ...]; | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | When a 2D tensor can be exported, it will look different from the spreadsheet: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var data_set_2d = [ | 
					
						
							|  |  |  |   [5.1, 4.9, ...], | 
					
						
							|  |  |  |   [3.5, 3, ...], | 
					
						
							|  |  |  |   ... | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This is the transpose of how people use spreadsheets! | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #### Typed Arrays and Columns | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A single typed array can be converted to a pure JS array with `Array.from`: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var column = Array.from(dataset_typedarray); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Similarly, `Float32Array.from` generates a typed array from a normal array: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var dataset = Float32Array.from(column); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Exporting Datasets to a Worksheet | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | `XLSX.utils.aoa_to_sheet` can generate a worksheet from an array of arrays. | 
					
						
							|  |  |  | ML libraries typically provide APIs to pull an array of arrays, but it will | 
					
						
							| 
									
										
										
										
											2022-08-19 06:42:18 +00:00
										 |  |  | be transposed. To export multiple data sets, manually "transpose" the data: | 
					
						
							| 
									
										
										
										
											2022-07-08 00:04:16 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | /* assuming data is an array of typed arrays */ | 
					
						
							|  |  |  | var aoa = []; | 
					
						
							|  |  |  | for(var i = 0; i < data.length; ++i) { | 
					
						
							|  |  |  |   for(var j = 0; j < data[i].length; ++j) { | 
					
						
							|  |  |  |     if(!aoa[j]) aoa[j] = []; | 
					
						
							|  |  |  |     aoa[j][i] = data[i][j]; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | /* aoa can be directly converted to a worksheet object */ | 
					
						
							|  |  |  | var ws = XLSX.utils.aoa_to_sheet(aoa); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Importing Data from a Spreadsheet | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | `sheet_to_json` with the option `header:1` will generate a row-major array of | 
					
						
							|  |  |  | arrays that can be transposed.  However, it is more efficient to walk the sheet | 
					
						
							|  |  |  | manually: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | /* find worksheet range */ | 
					
						
							|  |  |  | var range = XLSX.utils.decode_range(ws['!ref']); | 
					
						
							|  |  |  | var out = [] | 
					
						
							|  |  |  | /* walk the columns */ | 
					
						
							|  |  |  | for(var C = range.s.c; C <= range.e.c; ++C) { | 
					
						
							|  |  |  |   /* create the typed array */ | 
					
						
							|  |  |  |   var ta = new Float32Array(range.e.r - range.s.r + 1); | 
					
						
							|  |  |  |   /* walk the rows */ | 
					
						
							|  |  |  |   for(var R = range.s.r; R <= range.e.r; ++R) { | 
					
						
							|  |  |  |     /* find the cell, skip it if the cell isn't numeric or boolean */ | 
					
						
							|  |  |  |     var cell = ws[XLSX.utils.encode_cell({r:R, c:C})]; | 
					
						
							|  |  |  |     if(!cell || cell.t != 'n' && cell.t != 'b') continue; | 
					
						
							|  |  |  |     /* assign to the typed array */ | 
					
						
							|  |  |  |     ta[R - range.s.r] = cell.v; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  |   out.push(ta); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | If the data set has a header row, the loop can be adjusted to skip those rows. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### TF.js Tensors | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A single `Array#map` can pull individual named fields from the result, which | 
					
						
							|  |  |  | can be used to construct TensorFlow.js tensor objects: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | const aoo = XLSX.utils.sheet_to_json(worksheet); | 
					
						
							|  |  |  | const lengths = aoo.map(row => row["sepal length"]); | 
					
						
							|  |  |  | const tensor = tf.tensor1d(lengths); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | `tf.Tensor` objects can be directly transposed using `transpose`: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | var aoo = XLSX.utils.sheet_to_json(worksheet); | 
					
						
							|  |  |  | // "x" and "y" are the fields we want to pull from the data | 
					
						
							|  |  |  | var data = aoo.map(row => ([row["x"], row["y"]])); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // create a tensor representing two column datasets | 
					
						
							|  |  |  | var tensor = tf.tensor2d(data).transpose(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // individual columns can be accessed | 
					
						
							|  |  |  | var col1 = tensor.slice([0,0], [1,tensor.shape[1]]).flatten(); | 
					
						
							|  |  |  | var col2 = tensor.slice([1,0], [1,tensor.shape[1]]).flatten(); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | For exporting, `stack` can be used to linearize the columns: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ```js | 
					
						
							|  |  |  | /* pull data into a Float32Array */ | 
					
						
							|  |  |  | var result = tf.stack([col1, col2]).transpose(); | 
					
						
							|  |  |  | var shape = tensor.shape; | 
					
						
							|  |  |  | var f32 = tensor.dataSync(); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* construct an array of arrays of the data in spreadsheet order */ | 
					
						
							|  |  |  | var aoa = []; | 
					
						
							|  |  |  | for(var j = 0; j < shape[0]; ++j) { | 
					
						
							|  |  |  |   aoa[j] = []; | 
					
						
							|  |  |  |   for(var i = 0; i < shape[1]; ++i) aoa[j][i] = f32[j * shape[1] + i]; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* add headers to the top */ | 
					
						
							|  |  |  | aoa.unshift(["x", "y"]); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* generate worksheet */ | 
					
						
							|  |  |  | var worksheet = XLSX.utils.aoa_to_sheet(aoa); | 
					
						
							|  |  |  | ``` | 
					
						
							|  |  |  | 
 |