forked from sheetjs/sheetjs
		
	dta initial
This commit is contained in:
		
							parent
							
								
									cd5fafda32
								
							
						
					
					
						commit
						9199c2600c
					
				
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -32,6 +32,7 @@ tmp | ||||
| *.[eE][tT][hH] | ||||
| *.[nN][uU][mM][bB][eE][rR][sS] | ||||
| *.[mM][oO][dD] | ||||
| *.[dD][tT][aA] | ||||
| *.123 | ||||
| *.htm | ||||
| *.html | ||||
|  | ||||
| @ -173,8 +173,20 @@ function SSF_frac(x/*:number*/, D/*:number*/, mixed/*:?boolean*/)/*:Array<number | ||||
| 	var q = Math.floor(sgn * P/Q); | ||||
| 	return [q, sgn*P - q*Q, Q]; | ||||
| } | ||||
| function SSF_normalize_xl_unsafe(v/*:number*/)/*:number*/ { | ||||
| 	var s = v.toPrecision(16); | ||||
| 	if(s.indexOf("e") > -1) { | ||||
| 		var m = s.slice(0, s.indexOf("e")); | ||||
| 		m = m.indexOf(".") > -1 ? m.slice(0, (m.slice(0,2) == "0." ? 17 : 16)) : (m.slice(0,15) + fill("0", m.length - 15)); | ||||
| 		return m + s.slice(s.indexOf("e")); | ||||
| 	} | ||||
| 	var n = s.indexOf(".") > -1 ? s.slice(0, (s.slice(0,2) == "0." ? 17 : 16)) : (s.slice(0,15) + fill("0", s.length - 15)); | ||||
| 	return Number(n); | ||||
| } | ||||
| 
 | ||||
| function SSF_parse_date_code(v/*:number*/,opts/*:?any*/,b2/*:?boolean*/) { | ||||
| 	if(v > 2958465 || v < 0) return null; | ||||
| 	v = SSF_normalize_xl_unsafe(v); | ||||
| 	var date = (v|0), time = Math.floor(86400 * (v - date)), dow=0; | ||||
| 	var dout=[]; | ||||
| 	var out={D:date, T:time, u:86400*(v-date)-time,y:0,m:0,d:0,H:0,M:0,S:0,q:0}; | ||||
| @ -328,7 +340,7 @@ function SSF_write_date(type/*:number*/, fmt/*:string*/, val, ss0/*:?number*/)/* | ||||
| 		switch(fmt) { | ||||
| 			case '[h]': case '[hh]': out = val.D*24+val.H; break; | ||||
| 			case '[m]': case '[mm]': out = (val.D*24+val.H)*60+val.M; break; | ||||
| 			case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+Math.round(val.S+val.u); break; | ||||
| 			case '[s]': case '[ss]': out = ((val.D*24+val.H)*60+val.M)*60+(ss0 == 0 ? Math.round(val.S+val.u) : val.S); break; | ||||
| 			default: throw 'bad abstime format: ' + fmt; | ||||
| 		} outl = fmt.length === 3 ? 1 : 2; break; | ||||
| 		case 101: /* 'e' era */ | ||||
| @ -776,10 +788,11 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) { | ||||
| 		switch(out[i].t) { | ||||
| 			case 'h': case 'H': out[i].t = hr; lst='h'; if(bt < 1) bt = 1; break; | ||||
| 			case 's': | ||||
| 				if((ssm=out[i].v.match(/\.0+$/))) ss0=Math.max(ss0,ssm[0].length-1); | ||||
| 				if((ssm=out[i].v.match(/\.0+$/))) { ss0=Math.max(ss0,ssm[0].length-1); bt = 4;} | ||||
| 				if(bt < 3) bt = 3; | ||||
| 			/* falls through */ | ||||
| 			case 'd': case 'y': case 'M': case 'e': lst=out[i].t; break; | ||||
| 			case 'd': case 'y': case 'e': lst=out[i].t; break; | ||||
| 			case 'M': lst=out[i].t; if(bt < 2) bt = 2; break; | ||||
| 			case 'm': if(lst === 's') { out[i].t = 'M'; if(bt < 2) bt = 2; } break; | ||||
| 			case 'X': /*if(out[i].v === "B2");*/ | ||||
| 				break; | ||||
| @ -789,19 +802,29 @@ function eval_fmt(fmt/*:string*/, v/*:any*/, opts/*:any*/, flen/*:number*/) { | ||||
| 				if(bt < 3 && out[i].v.match(/[Ss]/)) bt = 3; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* time rounding depends on presence of minute / second / usec fields */ | ||||
| 	var _dt; | ||||
| 	switch(bt) { | ||||
| 		case 0: break; | ||||
| 		case 1: | ||||
| 			/*::if(!dt) break;*/ | ||||
| 		case 2: | ||||
| 		case 3: | ||||
| 			if(dt.u >= 0.5) { dt.u = 0; ++dt.S; } | ||||
| 			if(dt.S >=  60) { dt.S = 0; ++dt.M; } | ||||
| 			if(dt.M >=  60) { dt.M = 0; ++dt.H; } | ||||
| 			if(dt.H >=  24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; } | ||||
| 			break; | ||||
| 		case 2: | ||||
| 			/*::if(!dt) break;*/ | ||||
| 			if(dt.u >= 0.5) { dt.u = 0; ++dt.S; } | ||||
| 		case 4: | ||||
| 			switch(ss0) { | ||||
| 				case 1: dt.u = Math.round(dt.u * 10)/10; break; | ||||
| 				case 2: dt.u = Math.round(dt.u * 100)/100; break; | ||||
| 				case 3: dt.u = Math.round(dt.u * 1000)/1000; break; | ||||
| 			} | ||||
| 			if(dt.u >=   1) { dt.u = 0; ++dt.S; } | ||||
| 			if(dt.S >=  60) { dt.S = 0; ++dt.M; } | ||||
| 			if(dt.M >=  60) { dt.M = 0; ++dt.H; } | ||||
| 			if(dt.H >=  24) { dt.H = 0; ++dt.D; _dt = SSF_parse_date_code(dt.D); _dt.u = dt.u; _dt.S = dt.S; _dt.M = dt.M; _dt.H = dt.H; dt = _dt; } | ||||
| 			break; | ||||
| 	} | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										26
									
								
								packages/dta/.eslintrc
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										26
									
								
								packages/dta/.eslintrc
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,26 @@ | ||||
| { | ||||
| 	"env": { "shared-node-browser":true }, | ||||
| 	"globals": {}, | ||||
| 	"parserOptions": { | ||||
| 		"ecmaVersion": 6 | ||||
| 	}, | ||||
| 	"plugins": [ "html", "json" ], | ||||
| 	"extends": "eslint:recommended", | ||||
| 	"rules": { | ||||
| 		"comma-style": [ 2, "last" ], | ||||
| 		"comma-dangle": [ 2, "never" ], | ||||
| 		"curly": 0, | ||||
| 		"no-bitwise": 0, | ||||
| 		"no-cond-assign": 1, | ||||
| 		"no-console": 0, | ||||
| 		"no-control-regex": 0, | ||||
| 		"no-unused-vars": 1, | ||||
| 		"no-empty": 0, | ||||
| 		"no-trailing-spaces": 2, | ||||
| 		"no-use-before-define": [ 1, { | ||||
| 			"functions":false, "classes":true, "variables":false | ||||
| 		}], | ||||
| 		"no-useless-escape": 0, | ||||
| 		"semi": [ 2, "always" ] | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										22
									
								
								packages/dta/Makefile
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										22
									
								
								packages/dta/Makefile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,22 @@ | ||||
| .PHONY: build | ||||
| build: node browser | ||||
| 
 | ||||
| ## NodeJS target
 | ||||
| 
 | ||||
| .PHONY: node | ||||
| node: dist/dta.js | ||||
| 
 | ||||
| dist/dta.js: dta.ts | ||||
| 	npx esbuild@0.14.14 dta.ts --bundle --outdir=dist --platform=node | ||||
| 
 | ||||
| .PHONY: test-node | ||||
| test-node: dist/dta.js test.js | ||||
| 	npx mocha@2.5.3 test.js | ||||
| 
 | ||||
| ## Browser target
 | ||||
| .PHONY: browser | ||||
| browser: dist/dta.min.js | ||||
| 
 | ||||
| dist/dta.min.js: dta.ts | ||||
| 	npx esbuild@0.14.14 dta.ts --bundle --outfile=dist/dta.min.js --minify --sourcemap --global-name=DTA | ||||
| 
 | ||||
							
								
								
									
										9
									
								
								packages/dta/README.md
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										9
									
								
								packages/dta/README.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,9 @@ | ||||
| # DTA Data File Codec | ||||
| 
 | ||||
| Codec for reading Stata .DTA files and generating CSF workbook objects | ||||
| compatible with the [SheetJS](https://sheetjs.com) library constellation. | ||||
| 
 | ||||
| DTA datasets can support millions of observations and over 32767 variables. | ||||
| The codec will truncate data to 1048576 observations and 16384 variables. | ||||
| 
 | ||||
| <https://docs.sheetjs.com/docs/constellation/dta> includes a live demo. | ||||
							
								
								
									
										19
									
								
								packages/dta/bin/dta2csv.njs
									
									
									
									
									
										Executable file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										19
									
								
								packages/dta/bin/dta2csv.njs
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,19 @@ | ||||
#!/usr/bin/env node
/* eslint-env node, es6 */
/* dta2csv: parse the DTA file named by the first command-line argument and
 * print its first worksheet to stdout as CSV. */
const DTA = require("../");
const XLSX = (() => {
  let lib;
  try {
    lib = require("xlsx");
  } catch(e) {
    throw new Error("Must install the SheetJS file processing library! See https://docs.sheetjs.com/docs/getting-started/installation/nodejs for more details");
  }
  /* registered outside the `try` so a failure here is not misreported as a missing dependency */
  DTA.set_utils(lib.utils);
  return lib;
})();
const fs = require("fs");

/* fail early with a usage hint instead of a TypeError from readFileSync(undefined) */
const filename = process.argv[2];
if(!filename) {
  console.error("usage: dta2csv.njs <file.dta>");
  process.exit(1);
}

const buf = fs.readFileSync(filename);
const wb = DTA.parse(buf);
const ws = wb.Sheets[wb.SheetNames[0]];
// translate stub cells to single blanks
ws["!data"].forEach(row => row.forEach(cell => {if(cell.t == "z") {cell.t = "s"; cell.v = " ";}}));
console.log(XLSX.utils.sheet_to_csv(ws));
							
								
								
									
										542
									
								
								packages/dta/dist/dta.js
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										542
									
								
								packages/dta/dist/dta.js
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,542 @@ | ||||
| var __defProp = Object.defineProperty; | ||||
| var __getOwnPropDesc = Object.getOwnPropertyDescriptor; | ||||
| var __getOwnPropNames = Object.getOwnPropertyNames; | ||||
| var __hasOwnProp = Object.prototype.hasOwnProperty; | ||||
| var __markAsModule = (target) => __defProp(target, "__esModule", { value: true }); | ||||
| var __export = (target, all) => { | ||||
|   for (var name in all) | ||||
|     __defProp(target, name, { get: all[name], enumerable: true }); | ||||
| }; | ||||
| var __reExport = (target, module2, copyDefault, desc) => { | ||||
|   if (module2 && typeof module2 === "object" || typeof module2 === "function") { | ||||
|     for (let key of __getOwnPropNames(module2)) | ||||
|       if (!__hasOwnProp.call(target, key) && (copyDefault || key !== "default")) | ||||
|         __defProp(target, key, { get: () => module2[key], enumerable: !(desc = __getOwnPropDesc(module2, key)) || desc.enumerable }); | ||||
|   } | ||||
|   return target; | ||||
| }; | ||||
| var __toCommonJS = /* @__PURE__ */ ((cache) => { | ||||
|   return (module2, temp) => { | ||||
|     return cache && cache.get(module2) || (temp = __reExport(__markAsModule({}), module2, 1), cache && cache.set(module2, temp), temp); | ||||
|   }; | ||||
| })(typeof WeakMap !== "undefined" ? /* @__PURE__ */ new WeakMap() : 0); | ||||
| 
 | ||||
| // dta.ts
 | ||||
| var dta_exports = {}; | ||||
| __export(dta_exports, { | ||||
|   parse: () => parse, | ||||
|   set_utils: () => set_utils | ||||
| }); | ||||
| var _utils; | ||||
| function set_utils(utils) { | ||||
|   _utils = utils; | ||||
| } | ||||
| function u8_to_dataview(array) { | ||||
|   return new DataView(array.buffer, array.byteOffset, array.byteLength); | ||||
| } | ||||
| function valid_inc(p, n) { | ||||
|   if (p.str.slice(p.ptr, p.ptr + n.length) != n) | ||||
|     return false; | ||||
|   p.ptr += n.length; | ||||
|   return true; | ||||
| } | ||||
| function skip_end(p, n) { | ||||
|   const idx = p.str.indexOf(n, p.ptr); | ||||
|   if (idx == -1) | ||||
|     throw new Error(`Expected ${n} after offset ${p.ptr}`); | ||||
|   p.ptr = idx + n.length; | ||||
| } | ||||
| function slice_end(p, n) { | ||||
|   const idx = p.str.indexOf(n, p.ptr); | ||||
|   if (idx == -1) | ||||
|     throw new Error(`Expected ${n} after offset ${p.ptr}`); | ||||
|   const raw = p.raw.slice(p.ptr, idx); | ||||
|   const res = { | ||||
|     ptr: 0, | ||||
|     raw, | ||||
|     str: p.str.slice(p.ptr, idx), | ||||
|     dv: u8_to_dataview(raw) | ||||
|   }; | ||||
|   p.ptr = idx + n.length; | ||||
|   return res; | ||||
| } | ||||
| function read_f64(p, LE) { | ||||
|   p.ptr += 8; | ||||
|   const d = p.dv.getFloat64(p.ptr - 8, LE); | ||||
|   return d > 8988e304 ? null : d; | ||||
| } | ||||
| function read_f32(p, LE) { | ||||
|   p.ptr += 4; | ||||
|   const d = p.dv.getFloat32(p.ptr - 4, LE); | ||||
|   return d > 1701e35 ? null : d; | ||||
| } | ||||
| function read_u32(p, LE) { | ||||
|   p.ptr += 4; | ||||
|   return p.dv.getUint32(p.ptr - 4, LE); | ||||
| } | ||||
| function read_i32(p, LE) { | ||||
|   p.ptr += 4; | ||||
|   const u = p.dv.getInt32(p.ptr - 4, LE); | ||||
|   return u > 2147483620 ? null : u; | ||||
| } | ||||
| function read_u16(p, LE) { | ||||
|   p.ptr += 2; | ||||
|   return p.dv.getUint16(p.ptr - 2, LE); | ||||
| } | ||||
| function read_i16(p, LE) { | ||||
|   p.ptr += 2; | ||||
|   const u = p.dv.getInt16(p.ptr - 2, LE); | ||||
|   return u > 32740 ? null : u; | ||||
| } | ||||
| function read_u8(p) { | ||||
|   return p.raw[p.ptr++]; | ||||
| } | ||||
| function read_i8(p) { | ||||
|   let u = p.raw[p.ptr++]; | ||||
|   u = u < 128 ? u : u - 256; | ||||
|   return u > 100 ? null : u; | ||||
| } | ||||
| var SUPPORTED_VERSIONS_TAGGED = [ | ||||
|   "117", | ||||
|   "118" | ||||
| ]; | ||||
| function parse_tagged(raw) { | ||||
|   const err = "Not a DTA file"; | ||||
|   const str = new TextDecoder("latin1").decode(raw); | ||||
|   const d = { | ||||
|     ptr: 0, | ||||
|     raw, | ||||
|     str, | ||||
|     dv: u8_to_dataview(raw) | ||||
|   }; | ||||
|   let vers = 118; | ||||
|   let LE = true; | ||||
|   let nvar = 0, nobs = 0, nobs_lo = 0, nobs_hi = 0; | ||||
|   let label = "", timestamp = ""; | ||||
|   const var_types = []; | ||||
|   const var_names = []; | ||||
|   const formats = []; | ||||
|   if (!valid_inc(d, "<stata_dta>")) | ||||
|     throw err; | ||||
|   { | ||||
|     if (!valid_inc(d, "<header>")) | ||||
|       throw err; | ||||
|     { | ||||
|       if (!valid_inc(d, "<release>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</release>"); | ||||
|       if (SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1) | ||||
|         throw `Unsupported DTA ${res.str} file`; | ||||
|       vers = +res.str; | ||||
|     } | ||||
|     { | ||||
|       if (!valid_inc(d, "<byteorder>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</byteorder>"); | ||||
|       switch (res.str) { | ||||
|         case "MSF": | ||||
|           LE = false; | ||||
|           break; | ||||
|         case "LSF": | ||||
|           LE = true; | ||||
|           break; | ||||
|         default: | ||||
|           throw `Unsupported byteorder ${res.str}`; | ||||
|       } | ||||
|     } | ||||
|     { | ||||
|       if (!valid_inc(d, "<K>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</K>"); | ||||
|       nvar = read_u16(res, LE); | ||||
|     } | ||||
|     { | ||||
|       if (!valid_inc(d, "<N>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</N>"); | ||||
|       if (vers == 117) | ||||
|         nobs = nobs_lo = read_u32(res, LE); | ||||
|       else { | ||||
|         const lo = read_u32(res, LE), hi = read_u32(res, LE); | ||||
|         nobs = LE ? (nobs_lo = lo) + (nobs_hi = hi) * Math.pow(2, 32) : (nobs_lo = hi) + (nobs_hi = lo) * Math.pow(2, 32); | ||||
|       } | ||||
|       if (nobs > 1e6) | ||||
|         console.error(`More than 1 million observations -- extra rows will be dropped`); | ||||
|     } | ||||
|     { | ||||
|       if (!valid_inc(d, "<label>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</label>"); | ||||
|       const w = vers >= 118 ? 2 : 1; | ||||
|       const strlen = w == 1 ? read_u8(res) : read_u16(res, LE); | ||||
|       if (strlen + w != res.str.length) | ||||
|         throw `Expected string length ${strlen} but actual length was ${res.str.length - w}`; | ||||
|       if (strlen > 0) | ||||
|         label = new TextDecoder().decode(res.raw.slice(w)); | ||||
|     } | ||||
|     { | ||||
|       if (!valid_inc(d, "<timestamp>")) | ||||
|         throw err; | ||||
|       const res = slice_end(d, "</timestamp>"); | ||||
|       const strlen = read_u8(res); | ||||
|       if (strlen + 1 != res.str.length) | ||||
|         throw `Expected string length ${strlen} but actual length was ${res.str.length - 1}`; | ||||
|       if (strlen > 0) | ||||
|         timestamp = res.str.slice(1); | ||||
|     } | ||||
|     if (!valid_inc(d, "</header>")) | ||||
|       throw err; | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<map>")) | ||||
|       throw err; | ||||
|     skip_end(d, "</map>"); | ||||
|   } | ||||
|   let stride = 0; | ||||
|   { | ||||
|     if (!valid_inc(d, "<variable_types>")) | ||||
|       throw err; | ||||
|     const res = slice_end(d, "</variable_types>"); | ||||
|     if (res.raw.length != 2 * nvar) | ||||
|       throw `Expected variable_types length ${nvar * 2}, found ${res.raw.length}`; | ||||
|     while (res.ptr < res.raw.length) { | ||||
|       const type = read_u16(res, LE); | ||||
|       var_types.push(type); | ||||
|       if (type >= 1 && type <= 2045) | ||||
|         stride += type; | ||||
|       else | ||||
|         switch (type) { | ||||
|           case 32768: | ||||
|             stride += 8; | ||||
|             break; | ||||
|           case 65526: | ||||
|             stride += 8; | ||||
|             break; | ||||
|           case 65527: | ||||
|             stride += 4; | ||||
|             break; | ||||
|           case 65528: | ||||
|             stride += 4; | ||||
|             break; | ||||
|           case 65529: | ||||
|             stride += 2; | ||||
|             break; | ||||
|           case 65530: | ||||
|             stride += 1; | ||||
|             break; | ||||
|           default: | ||||
|             throw `Unsupported field type ${type}`; | ||||
|         } | ||||
|     } | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<varnames>")) | ||||
|       throw err; | ||||
|     const res = slice_end(d, "</varnames>"); | ||||
|     const w = vers >= 118 ? 129 : 33; | ||||
|     if (res.raw.length != w * nvar) | ||||
|       throw `Expected variable_types length ${nvar * w}, found ${res.raw.length}`; | ||||
|     while (res.ptr < res.raw.length) { | ||||
|       const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); | ||||
|       res.ptr += w; | ||||
|       var_names.push(name.replace(/\x00[\s\S]*/, "")); | ||||
|     } | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<sortlist>")) | ||||
|       throw err; | ||||
|     const res = slice_end(d, "</sortlist>"); | ||||
|     if (res.raw.length != 2 * nvar + 2) | ||||
|       throw `Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`; | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<formats>")) | ||||
|       throw err; | ||||
|     const res = slice_end(d, "</formats>"); | ||||
|     const w = vers >= 118 ? 57 : 49; | ||||
|     if (res.raw.length != w * nvar) | ||||
|       throw `Expected formats length ${nvar * w}, found ${res.raw.length}`; | ||||
|     while (res.ptr < res.raw.length) { | ||||
|       const name = new TextDecoder().decode(res.raw.slice(res.ptr, res.ptr + w)); | ||||
|       res.ptr += w; | ||||
|       formats.push(name.replace(/\x00[\s\S]*/, "")); | ||||
|     } | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<value_label_names>")) | ||||
|       throw err; | ||||
|     const w = vers >= 118 ? 129 : 33; | ||||
|     const res = slice_end(d, "</value_label_names>"); | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<variable_labels>")) | ||||
|       throw err; | ||||
|     const w = vers >= 118 ? 321 : 81; | ||||
|     const res = slice_end(d, "</variable_labels>"); | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<characteristics>")) | ||||
|       throw err; | ||||
|     while (d.str.slice(d.ptr, d.ptr + 4) == "<ch>") { | ||||
|       d.ptr += 4; | ||||
|       const len = read_u32(d, LE); | ||||
|       d.ptr += len; | ||||
|       if (!valid_inc(d, "</ch>")) | ||||
|         throw err; | ||||
|     } | ||||
|     if (!valid_inc(d, "</characteristics>")) | ||||
|       throw err; | ||||
|   } | ||||
|   const ws = _utils.aoa_to_sheet([var_names], { dense: true }); | ||||
|   var ptrs = []; | ||||
|   { | ||||
|     if (!valid_inc(d, "<data>")) | ||||
|       throw err; | ||||
|     for (let R = 0; R < nobs; ++R) { | ||||
|       const row = []; | ||||
|       for (let C = 0; C < nvar; ++C) { | ||||
|         let t = var_types[C]; | ||||
|         if (t >= 1 && t <= 2045) { | ||||
|           let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t)); | ||||
|           s = s.replace(/\x00[\s\S]*/, ""); | ||||
|           row[C] = s; | ||||
|           d.ptr += t; | ||||
|         } else | ||||
|           switch (t) { | ||||
|             case 65526: | ||||
|               row[C] = read_f64(d, LE); | ||||
|               break; | ||||
|             case 65527: | ||||
|               row[C] = read_f32(d, LE); | ||||
|               break; | ||||
|             case 65528: | ||||
|               row[C] = read_i32(d, LE); | ||||
|               break; | ||||
|             case 65529: | ||||
|               row[C] = read_i16(d, LE); | ||||
|               break; | ||||
|             case 65530: | ||||
|               row[C] = read_i8(d); | ||||
|               break; | ||||
|             case 32768: | ||||
|               { | ||||
|                 row[C] = "##SheetJStrL##"; | ||||
|                 ptrs.push([R + 1, C, d.raw.slice(d.ptr, d.ptr + 8)]); | ||||
|                 d.ptr += 8; | ||||
|               } | ||||
|               break; | ||||
|             default: | ||||
|               throw `Unsupported field type ${t} for ${var_names[C]}`; | ||||
|           } | ||||
|       } | ||||
|       _utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true }); | ||||
|     } | ||||
|     if (!valid_inc(d, "</data>")) | ||||
|       throw err; | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<strls>")) | ||||
|       throw err; | ||||
|     const strl_tbl = []; | ||||
|     while (d.raw[d.ptr] == 71) { | ||||
|       if (!valid_inc(d, "GSO")) | ||||
|         throw err; | ||||
|       const v = read_u32(d, LE); | ||||
|       let o = 0; | ||||
|       if (vers == 117) | ||||
|         o = read_u32(d, LE); | ||||
|       else { | ||||
|         const lo = read_u32(d, LE), hi = read_u32(d, LE); | ||||
|         o = LE ? lo + hi * Math.pow(2, 32) : hi + lo * Math.pow(2, 32); | ||||
|         if (o > 1e6) | ||||
|           console.error(`More than 1 million observations -- data will be dropped`); | ||||
|       } | ||||
|       const t = read_u8(d); | ||||
|       const len = read_u32(d, LE); | ||||
|       if (!strl_tbl[o]) | ||||
|         strl_tbl[o] = []; | ||||
|       let str2 = ""; | ||||
|       if (t == 129) { | ||||
|         str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)); | ||||
|         d.ptr += len; | ||||
|       } else { | ||||
|         str2 = new TextDecoder("latin1").decode(d.raw.slice(d.ptr, d.ptr + len)).replace(/\x00$/, ""); | ||||
|         d.ptr += len; | ||||
|       } | ||||
|       strl_tbl[o][v] = str2; | ||||
|     } | ||||
|     if (!valid_inc(d, "</strls>")) | ||||
|       throw err; | ||||
|     ptrs.forEach(([R, C, buf]) => { | ||||
|       const dv = u8_to_dataview(buf); | ||||
|       let v = 0, o = 0; | ||||
|       switch (vers) { | ||||
|         case 117: | ||||
|           { | ||||
|             v = dv.getUint32(0, LE); | ||||
|             o = dv.getUint32(4, LE); | ||||
|           } | ||||
|           break; | ||||
|         case 118: | ||||
|         case 120: | ||||
|           { | ||||
|             v = dv.getUint16(0, LE); | ||||
|             const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE); | ||||
|             o = LE ? o1 + o2 * 65536 : o2 + o1 * 2 ** 32; | ||||
|           } | ||||
|           break; | ||||
|         case 119: | ||||
|         case 121: { | ||||
|           const v1 = dv.getUint16(0, LE), v2 = buf[2]; | ||||
|           v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8); | ||||
|           const o1 = buf[3], o2 = dv.getUint32(4, LE); | ||||
|           o = LE ? o1 + o2 * 256 : o2 + o1 * 2 ** 32; | ||||
|         } | ||||
|       } | ||||
|       ws["!data"][R][C].v = strl_tbl[o][v]; | ||||
|     }); | ||||
|   } | ||||
|   { | ||||
|     if (!valid_inc(d, "<value_labels>")) | ||||
|       throw err; | ||||
|     const res = slice_end(d, "</value_labels>"); | ||||
|   } | ||||
|   if (!valid_inc(d, "</stata_dta>")) | ||||
|     throw err; | ||||
|   const wb = _utils.book_new(); | ||||
|   _utils.book_append_sheet(wb, ws, "Sheet1"); | ||||
|   return wb; | ||||
| } | ||||
| function parse_legacy(raw) { | ||||
|   let vers = raw[0]; | ||||
|   switch (vers) { | ||||
|     case 102: | ||||
|     case 112: | ||||
|       throw `Unsupported DTA ${vers} file`; | ||||
|     case 103: | ||||
|     case 104: | ||||
|     case 105: | ||||
|     case 108: | ||||
|     case 110: | ||||
|     case 111: | ||||
|     case 113: | ||||
|     case 114: | ||||
|     case 115: | ||||
|       break; | ||||
|     default: | ||||
|       throw new Error("Not a DTA file"); | ||||
|   } | ||||
|   const d = { | ||||
|     ptr: 1, | ||||
|     raw, | ||||
|     str: "", | ||||
|     dv: u8_to_dataview(raw) | ||||
|   }; | ||||
|   let LE = true; | ||||
|   let nvar = 0, nobs = 0; | ||||
|   let label = "", timestamp = ""; | ||||
|   const var_types = []; | ||||
|   const var_names = []; | ||||
|   const formats = []; | ||||
|   { | ||||
|     const byteorder = read_u8(d); | ||||
|     switch (byteorder) { | ||||
|       case 1: | ||||
|         LE = false; | ||||
|         break; | ||||
|       case 2: | ||||
|         LE = true; | ||||
|         break; | ||||
|       default: | ||||
|         throw `DTA ${vers} Unexpected byteorder ${byteorder}`; | ||||
|     } | ||||
|     let byte = read_u8(d); | ||||
|     if (byte != 1) | ||||
|       throw `DTA ${vers} Unexpected filetype ${byte}`; | ||||
|     d.ptr++; | ||||
|     nvar = read_u16(d, LE); | ||||
|     nobs = read_u32(d, LE); | ||||
|     d.ptr += vers >= 108 ? 81 : 32; | ||||
|     if (vers >= 105) | ||||
|       d.ptr += 18; | ||||
|   } | ||||
|   { | ||||
|     let C = 0; | ||||
|     for (C = 0; C < nvar; ++C) | ||||
|       var_types.push(read_u8(d)); | ||||
|     const w = vers >= 110 ? 33 : 9; | ||||
|     for (C = 0; C < nvar; ++C) { | ||||
|       var_names.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + w)).replace(/\x00[\s\S]*$/, "")); | ||||
|       d.ptr += w; | ||||
|     } | ||||
|     d.ptr += 2 * (nvar + 1); | ||||
|     const fw = vers >= 114 ? 49 : vers >= 105 ? 12 : 7; | ||||
|     for (C = 0; C < nvar; ++C) { | ||||
|       formats.push(new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + fw)).replace(/\x00[\s\S]*$/, "")); | ||||
|       d.ptr += fw; | ||||
|     } | ||||
|     d.ptr += (vers >= 110 ? 33 : 9) * nvar; | ||||
|   } | ||||
|   d.ptr += (vers >= 106 ? 81 : 32) * nvar; | ||||
|   if (vers >= 105) | ||||
|     while (d.ptr < d.raw.length) { | ||||
|       const dt = read_u8(d), len = (vers >= 111 ? read_u32 : read_u16)(d, LE); | ||||
|       if (dt == 0 && len == 0) | ||||
|         break; | ||||
|       d.ptr += len; | ||||
|     } | ||||
|   const ws = _utils.aoa_to_sheet([var_names], { dense: true }); | ||||
|   for (let R = 0; R < nobs; ++R) { | ||||
|     const row = []; | ||||
|     for (let C = 0; C < nvar; ++C) { | ||||
|       let t = var_types[C]; | ||||
|       if (vers >= 111 && t >= 1 && t <= 244) { | ||||
|         let s = new TextDecoder().decode(d.raw.slice(d.ptr, d.ptr + t)); | ||||
|         s = s.replace(/\x00[\s\S]*/, ""); | ||||
|         row[C] = s; | ||||
|         d.ptr += t; | ||||
|       } else | ||||
|         switch (t) { | ||||
|           case 251: | ||||
|           case 98: | ||||
|             row[C] = read_i8(d); | ||||
|             break; | ||||
|           case 252: | ||||
|           case 105: | ||||
|             row[C] = read_i16(d, LE); | ||||
|             break; | ||||
|           case 253: | ||||
|           case 108: | ||||
|             row[C] = read_i32(d, LE); | ||||
|             break; | ||||
|           case 254: | ||||
|           case 102: | ||||
|             row[C] = read_f32(d, LE); | ||||
|             break; | ||||
|           case 255: | ||||
|           case 100: | ||||
|             row[C] = read_f64(d, LE); | ||||
|             break; | ||||
|           default: | ||||
|             throw `Unsupported field type ${t} for ${var_names[C]}`; | ||||
|         } | ||||
|     } | ||||
|     _utils.sheet_add_aoa(ws, [row], { origin: -1, sheetStubs: true }); | ||||
|   } | ||||
|   const wb = _utils.book_new(); | ||||
|   _utils.book_append_sheet(wb, ws, "Sheet1"); | ||||
|   return wb; | ||||
| } | ||||
| function parse(data) { | ||||
|   if (data[0] >= 102 && data[0] <= 115) | ||||
|     return parse_legacy(data); | ||||
|   if (data[0] === 60) | ||||
|     return parse_tagged(data); | ||||
|   throw new Error("Not a DTA file"); | ||||
| } | ||||
| module.exports = __toCommonJS(dta_exports); | ||||
| // Annotate the CommonJS export names for ESM import in node:
 | ||||
| 0 && (module.exports = { | ||||
|   parse, | ||||
|   set_utils | ||||
| }); | ||||
							
								
								
									
										2
									
								
								packages/dta/dist/dta.min.js
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								packages/dta/dist/dta.min.js
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										543
									
								
								packages/dta/dta.ts
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										543
									
								
								packages/dta/dta.ts
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,543 @@ | ||||
| import { DenseWorkSheet, WorkBook, type utils } from 'xlsx'; | ||||
| export { parse, set_utils }; | ||||
| 
 | ||||
| let _utils: typeof utils; | ||||
/**
 * Store the SheetJS `utils` object in the module-level `_utils` binding.
 *
 * Usage:
 *
 * ```js
 * const XLSX = require("xlsx");
 * const DTA = require("dta");
 * DTA.set_utils(XLSX.utils);
 * ```
 *
 * @param utils the `utils` object of the SheetJS library
 */
function set_utils(utils: any): void {
  _utils = utils;
}
| 
 | ||||
/**
 * Cursor over a chunk of file data.
 *
 * `raw`, `str` and `dv` are three views of the same bytes; `ptr` is the
 * current read position and is advanced by the `read_*` / `valid_inc` /
 * `skip_end` / `slice_end` helpers.
 */
interface Payload {
  /** Current read offset (in bytes) into `raw` / `str` / `dv` */
  ptr: number;

  /** The bytes themselves */
  raw: Uint8Array;

  /** Same bytes decoded one character per byte, used for tag searches */
  str: string;

  /** DataView over `raw`, used for multi-byte numeric reads */
  dv: DataView;
}
| 
 | ||||
/**
 * Build a DataView spanning exactly the bytes of `array`.
 *
 * The view shares the underlying buffer (no copy) and honours the array's
 * offset and length, so it is correct for subarrays and NodeJS `Buffer` slices.
 */
function u8_to_dataview(array: Uint8Array): DataView {
  const { buffer, byteOffset, byteLength } = array;
  return new DataView(buffer, byteOffset, byteLength);
}
/**
 * Test whether the text at the cursor equals `n`.
 *
 * @param p payload; `p.ptr` is advanced past `n` only when it matches
 * @param n expected literal (e.g. an opening or closing tag)
 * @returns true when `n` was found at the cursor and consumed
 */
function valid_inc(p: Payload, n: string): boolean {
  const matched = p.str.slice(p.ptr, p.ptr + n.length) == n;
  if(matched) p.ptr += n.length;
  return matched;
}
| 
 | ||||
/**
 * Move the cursor to just after the next occurrence of `n`.
 *
 * @param p payload whose `ptr` is advanced
 * @param n literal to search for, starting at the cursor
 * @throws Error when `n` does not occur at or after the cursor
 */
function skip_end(p: Payload, n: string): void {
  const found = p.str.indexOf(n, p.ptr);
  if(found < 0) throw new Error(`Expected ${n} after offset ${p.ptr}`);
  p.ptr = found + n.length;
}
/**
 * Cut out everything between the cursor and the next occurrence of `n`.
 *
 * @param p payload; its `ptr` is moved to just after `n`
 * @param n terminating literal (typically a closing tag)
 * @returns a fresh payload (cursor at 0) holding the bytes before `n`
 * @throws Error when `n` does not occur at or after the cursor
 */
function slice_end(p: Payload, n: string): Payload {
  const start = p.ptr;
  const stop = p.str.indexOf(n, start);
  if(stop < 0) throw new Error(`Expected ${n} after offset ${start}`);

  const bytes = p.raw.slice(start, stop);
  p.ptr = stop + n.length;

  return {
    ptr: 0,
    raw: bytes,
    str: p.str.slice(start, stop),
    /* view over exactly the extracted bytes */
    dv: new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  };
}
| 
 | ||||
/**
 * Read a Stata `double` at the cursor and advance 8 bytes.
 *
 * Stata encodes missing values (`.`, `.a` ... `.z`) as the bit patterns from
 * 0x7fe0000000000000 upward, i.e. every value >= 2^1023. The largest valid
 * double is 0x7fdfffffffffffff, just below that boundary, so the comparison
 * uses the exact boundary rather than a rounded decimal constant (which
 * would misreport the topmost valid values as missing).
 *
 * @param p payload to read from; `p.ptr` is advanced by 8
 * @param LE true for little-endian ("LSF") files
 * @returns the value, or null when it is a missing-value code
 */
function read_f64(p: Payload, LE: boolean): number | null {
  const d = p.dv.getFloat64(p.ptr, LE);
  p.ptr += 8;
  return d >= 2 ** 1023 ? null : d;
}
/**
 * Read a Stata `float` at the cursor and advance 4 bytes.
 *
 * Missing values (`.`, `.a` ... `.z`) are the bit patterns from 0x7f000000
 * upward, i.e. every value >= 2^127. The largest valid float is 0x7effffff,
 * just below that boundary, so the comparison uses the exact boundary
 * rather than a rounded decimal constant (which would misreport the topmost
 * valid values as missing).
 *
 * @param p payload to read from; `p.ptr` is advanced by 4
 * @param LE true for little-endian ("LSF") files
 * @returns the value, or null when it is a missing-value code
 */
function read_f32(p: Payload, LE: boolean): number | null {
  const d = p.dv.getFloat32(p.ptr, LE);
  p.ptr += 4;
  return d >= 2 ** 127 ? null : d;
}
/**
 * Read an unsigned 32-bit integer at the cursor and advance 4 bytes.
 *
 * @param p payload to read from
 * @param LE true for little-endian byte order
 */
function read_u32(p: Payload, LE: boolean) {
  const at = p.ptr;
  p.ptr = at + 4;
  return p.dv.getUint32(at, LE);
}
/**
 * Read a signed 32-bit integer at the cursor and advance 4 bytes.
 *
 * Values above 0x7fffffe4 are reported as null (the code treats them as
 * missing-value markers).
 *
 * @param p payload to read from
 * @param LE true for little-endian byte order
 * @returns the integer, or null when it exceeds 0x7fffffe4
 */
function read_i32(p: Payload, LE: boolean): number | null {
  const at = p.ptr;
  p.ptr = at + 4;
  const value = p.dv.getInt32(at, LE);
  if(value > 0x7fffffe4) return null;
  return value;
}
/**
 * Read an unsigned 16-bit integer at the cursor and advance 2 bytes.
 *
 * @param p payload to read from
 * @param LE true for little-endian byte order
 */
function read_u16(p: Payload, LE: boolean) {
  const at = p.ptr;
  p.ptr = at + 2;
  return p.dv.getUint16(at, LE);
}
/**
 * Read a signed 16-bit integer at the cursor and advance 2 bytes.
 *
 * Values above 32740 are reported as null (the code treats them as
 * missing-value markers).
 *
 * @param p payload to read from
 * @param LE true for little-endian byte order
 * @returns the integer, or null when it exceeds 32740
 */
function read_i16(p: Payload, LE: boolean): number | null {
  const at = p.ptr;
  p.ptr = at + 2;
  const value = p.dv.getInt16(at, LE);
  if(value > 32740) return null;
  return value;
}
/**
 * Read one unsigned byte at the cursor and advance by one.
 *
 * @param p payload to read from
 */
function read_u8(p: Payload) {
  const byte = p.raw[p.ptr];
  p.ptr += 1;
  return byte;
}
/**
 * Read one signed byte at the cursor and advance by one.
 *
 * Values above 100 are reported as null (the code treats them as
 * missing-value markers).
 *
 * @param p payload to read from
 * @returns the integer, or null when it exceeds 100
 */
function read_i8(p: Payload): number | null {
  const byte = p.raw[p.ptr++];
  /* reinterpret 128..255 as -128..-1 */
  const signed = byte < 128 ? byte : byte - 256;
  if(signed > 100) return null;
  return signed;
}
| 
 | ||||
/**
 * `<release>` values accepted by `parse_tagged`.
 *
 * Not yet enabled:
 *   "119" -- stata 15/16/17/18 (> 32767 variables)
 *   "120" -- stata 18 (<= 32767, with aliases)
 *   "121" -- stata 18 (> 32767, with aliases)
 */
const SUPPORTED_VERSIONS_TAGGED = [
  /* stata 13 */
  "117",
  /* stata 14-18 */
  "118",
];
| 
 | ||||
/**
 * Parse a tagged (`<stata_dta>` ... `</stata_dta>`) DTA file.
 *
 * Only the releases listed in `SUPPORTED_VERSIONS_TAGGED` are accepted. The
 * sections are consumed in file order; the result is a workbook with a single
 * dense worksheet "Sheet1" whose first row holds the variable names and whose
 * following rows hold one observation each. Numeric missing values become
 * stub cells (`sheetStubs`).
 *
 * The dataset label, timestamp and display formats are read and validated but
 * are not yet surfaced in the workbook. Value labels, variable labels and
 * characteristics are skipped.
 *
 * @param raw complete file contents
 * @returns workbook containing the dataset
 * @throws Error "Not a DTA file" when an expected tag is missing, or a more
 *   specific Error for unsupported releases / byte orders / field types and
 *   for sections whose length does not match the header
 */
function parse_tagged(raw: Uint8Array): WorkBook {
  /* sadly the web zealots decided to abandon binary strings */
  const str = new TextDecoder('latin1').decode(raw);

  const d: Payload = {
    ptr: 0,
    raw,
    str,
    dv: u8_to_dataview(raw)
  };

  /* decoding is stateless per call, so share decoders instead of building one per cell */
  const utf8 = new TextDecoder();
  const latin1 = new TextDecoder("latin1");

  /* consume `tag` at the cursor or reject the file (same message as `parse`) */
  const expect = (tag: string): void => {
    if(!valid_inc(d, tag)) throw new Error("Not a DTA file");
  };
  /* decode a NUL-padded fixed-width field, dropping everything from the first NUL */
  const fixed_str = (bytes: Uint8Array): string => utf8.decode(bytes).replace(/\x00[\s\S]*/,"");

  let vers: number = 118;
  let LE: boolean = true;
  let nvar: number = 0, nobs: number = 0;
  /* parsed for validation only; not yet exposed in the workbook */
  let label: string = "", timestamp: string = "";
  const var_types: number[] = [];
  const var_names: string[] = [];
  const formats: string[] = [];

  /* 5. Dataset format definition */
  expect("<stata_dta>");

  /* 5.1 Header <header> */
  {
    expect("<header>");

    /* <release> */
    {
      expect("<release>");
      const res = slice_end(d, "</release>");
      if(SUPPORTED_VERSIONS_TAGGED.indexOf(res.str) == -1) throw new Error(`Unsupported DTA ${res.str} file`);
      vers = +res.str;
    }

    /* <byteorder> */
    {
      expect("<byteorder>");
      const res = slice_end(d, "</byteorder>");
      switch(res.str) {
        case "MSF": LE = false; break;
        case "LSF": LE = true; break;
        default: throw new Error(`Unsupported byteorder ${res.str}`);
      }
    }

    /* <K> number of variables */
    {
      expect("<K>");
      const res = slice_end(d, "</K>");
      nvar = read_u16(res, LE);
    }

    /* <N> number of observations: 4 bytes in 117, 8 bytes otherwise */
    {
      expect("<N>");
      const res = slice_end(d, "</N>");
      if(vers == 117) nobs = read_u32(res, LE);
      else {
        /* assemble the 64-bit count from two 32-bit words in file order */
        const first = read_u32(res, LE), second = read_u32(res, LE);
        nobs = LE ? (first + second * Math.pow(2,32)) : (second + first * Math.pow(2,32));
      }
      if(nobs > 1e6) console.error(`More than 1 million observations -- extra rows will be dropped`);
    }

    /* <label> length-prefixed dataset label (1-byte length in 117, 2-byte otherwise) */
    {
      expect("<label>");
      const res = slice_end(d, "</label>");
      const w = vers >= 118 ? 2 : 1;
      const strlen = w == 1 ? read_u8(res) : read_u16(res, LE);
      if(strlen + w != res.str.length) throw new Error(`Expected string length ${strlen} but actual length was ${res.str.length - w}`);
      if(strlen > 0) label = utf8.decode(res.raw.subarray(w));
    }

    /* <timestamp> length-prefixed (1 byte) */
    {
      expect("<timestamp>");
      const res = slice_end(d, "</timestamp>");
      const strlen = read_u8(res);
      if(strlen + 1 != res.str.length) throw new Error(`Expected string length ${strlen} but actual length was ${res.str.length - 1}`);
      if(strlen > 0) timestamp = res.str.slice(1);
    }

    expect("</header>");
  }

  /* 5.2 Map <map> */
  {
    /* TODO: validate map? */
    expect("<map>");
    /* 14 8-byte offsets for:
      <stata_data>
      <map>
      <variable_types>
      <varnames>
      <sortlist>
      <formats>
      <value_label_names>
      <variable_labels>
      <characteristics>
      <data>
      <strls>
      <value_labels>
      </stata_data>
      EOF
    */
    skip_end(d, "</map>");
  }

  /* 5.3 Variable types <variable_types> */
  {
    expect("<variable_types>");
    const res = slice_end(d, "</variable_types>");
    if(res.raw.length != 2 * nvar) throw new Error(`Expected variable_types length ${nvar * 2}, found ${res.raw.length}`);
    while(res.ptr < res.raw.length) {
      const type = read_u16(res, LE);
      var_types.push(type);
      /* 1..2045 are fixed-width strings of that many bytes */
      if(type >= 1 && type <= 2045) continue;
      switch(type) {
        case 32768: /* strL reference */
        case 65526: /* double */
        case 65527: /* float */
        case 65528: /* long */
        case 65529: /* int */
        case 65530: /* byte */
          break;
        default: throw new Error(`Unsupported field type ${type}`);
      }
    }
  }

  /* 5.4 Variable names <varnames> */
  {
    expect("<varnames>");
    const res = slice_end(d, "</varnames>");
    const w = vers >= 118 ? 129 : 33;
    if(res.raw.length != w * nvar) throw new Error(`Expected varnames length ${nvar * w}, found ${res.raw.length}`);
    for(; res.ptr < res.raw.length; res.ptr += w) {
      var_names.push(fixed_str(res.raw.subarray(res.ptr, res.ptr + w)));
    }
  }

  /* 5.5 Sort order of observations <sortlist> */
  {
    /* TODO: check sort list? */
    expect("<sortlist>");
    const res = slice_end(d, "</sortlist>");
    if(res.raw.length != 2 * nvar + 2) throw new Error(`Expected sortlist length ${nvar * 2 + 2}, found ${res.raw.length}`);
  }

  /* 5.6 Display formats <formats> */
  {
    expect("<formats>");
    const res = slice_end(d, "</formats>");
    const w = vers >= 118 ? 57 : 49;
    if(res.raw.length != w * nvar) throw new Error(`Expected formats length ${nvar * w}, found ${res.raw.length}`);
    for(; res.ptr < res.raw.length; res.ptr += w) {
      formats.push(fixed_str(res.raw.subarray(res.ptr, res.ptr + w)));
    }
  }

  /* TODO: <value_label_names> (entries are 129 bytes wide in 118+, 33 in 117) */
  {
    expect("<value_label_names>");
    skip_end(d, "</value_label_names>");
  }

  /* TODO: <variable_labels> (entries are 321 bytes wide in 118+, 81 in 117) */
  {
    expect("<variable_labels>");
    skip_end(d, "</variable_labels>");
  }

  /* 5.9 Characteristics <characteristics> */
  {
    expect("<characteristics>");
    while(d.str.slice(d.ptr, d.ptr + 4) == "<ch>") {
      d.ptr += 4;
      /* each entry is a 4-byte length followed by that many bytes */
      const len = read_u32(d, LE);
      d.ptr += len;
      expect("</ch>");
    }
    expect("</characteristics>");
  }

  const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet);

  /* [worksheet row, column, raw 8-byte (v,o) reference] for every strL cell */
  const ptrs: Array<[number, number, Uint8Array]> = [];

  /* 5.10 Data <data> */
  {
    expect("<data>");
    for(let R = 0; R < nobs; ++R) {
      const row: any[] = [];
      for(let C = 0; C < nvar; ++C) {
        const t = var_types[C];
        // TODO: formats, dta_12{0,1} aliases?
        if(t >= 1 && t <= 2045) {
          /* NOTE: dta_117 restricts strf to ASCII */
          row[C] = fixed_str(d.raw.subarray(d.ptr, d.ptr + t));
          d.ptr += t;
        } else switch(t) {
          case 65526: row[C] = read_f64(d, LE); break;
          case 65527: row[C] = read_f32(d, LE); break;
          case 65528: row[C] = read_i32(d, LE); break;
          case 65529: row[C] = read_i16(d, LE); break;
          case 65530: row[C] = read_i8(d); break;
          case 32768: {
            /* placeholder; replaced once the <strls> section has been read */
            row[C] = "##SheetJStrL##";
            ptrs.push([R+1,C, d.raw.slice(d.ptr, d.ptr + 8)]);
            d.ptr += 8;
          } break;
          default: throw new Error(`Unsupported field type ${t} for ${var_names[C]}`);
        }
      }
      _utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true});
    }
    expect("</data>");
  }

  /* 5.11 StrLs <strls> */
  {
    expect("<strls>");

    /* strl_tbl[o][v] is the string stored for variable v, observation o */
    const strl_tbl: string[][] = [];
    while(d.raw[d.ptr] == 71 /* G */) {
      expect("GSO");
      const v = read_u32(d, LE);
      let o = 0;
      if(vers == 117) o = read_u32(d, LE);
      else {
        const first = read_u32(d, LE), second = read_u32(d, LE);
        o = LE ? (first + second * Math.pow(2,32)) : (second + first * Math.pow(2,32));
        if(o > 1e6) console.error(`More than 1 million observations -- data will be dropped`);
      }
      const t = read_u8(d);
      const len = read_u32(d, LE);
      const bytes = d.raw.subarray(d.ptr, d.ptr + len);
      d.ptr += len;

      let text = "";
      if(t == 129) {
        /* binary payload: keep one character per byte */
        // TODO: codepage
        text = latin1.decode(bytes);
      } else {
        /* text payload with a trailing NUL; release 118+ stores UTF-8 */
        text = (vers >= 118 ? utf8 : latin1).decode(bytes).replace(/\x00$/,"");
      }
      if(!strl_tbl[o]) strl_tbl[o] = [];
      strl_tbl[o][v] = text;
    }
    expect("</strls>");

    for(const [R, C, buf] of ptrs) {
      const dv = u8_to_dataview(buf);
      let v = 0, o = 0;
      switch(vers) {
        case 117: { // v(4) o(4)
          v = dv.getUint32(0, LE);
          o = dv.getUint32(4, LE);
        } break;

        case 118: case 120: { // v(2) o(6)
          v = dv.getUint16(0, LE);
          const o1 = dv.getUint16(2, LE), o2 = dv.getUint32(4, LE);
          o = LE ? o1 + o2 * 65536 : o2 + o1 * (2**32);
        } break;

        case 119: case 121: { // v(3) o(5)
          const v1 = dv.getUint16(0, LE), v2 = buf[2];
          v = LE ? v1 + (v2 << 16) : v2 + (v1 << 8);
          const o1 = buf[3], o2 = dv.getUint32(4, LE);
          o = LE ? o1 + o2 * 256 : o2 + o1 * (2**32);
        } break;
      }
      /* (v,o) = (0,0) denotes the empty string and has no GSO record, so a
         reference without a table entry resolves to "" instead of crashing */
      const hit = strl_tbl[o]?.[v];
      ws["!data"][R][C].v = hit == null ? "" : hit;
    }
  }

  /* 5.12 Value labels <value_labels> (not interpreted) */
  {
    expect("<value_labels>");
    skip_end(d, "</value_labels>");
  }

  expect("</stata_dta>");

  const wb = _utils.book_new();
  _utils.book_append_sheet(wb, ws, "Sheet1");
  return wb;
}
| 
 | ||||
/**
 * Parse a legacy (untagged, binary header) DTA file.
 *
 * The first byte is the format release. Releases 103, 104, 105, 108, 110,
 * 111, 113, 114 and 115 are read; 102 and 112 are recognised but rejected.
 * The result is a workbook with a single dense worksheet "Sheet1" whose first
 * row holds the variable names and whose following rows hold one observation
 * each. Numeric missing values become stub cells (`sheetStubs`).
 *
 * The dataset label, timestamp, sort list, value-label names, variable labels
 * and expansion fields are skipped; display formats are read but unused.
 *
 * @param raw complete file contents
 * @returns workbook containing the dataset
 * @throws Error for unknown or unsupported releases, an unexpected byte order
 *   or file type, or an unsupported variable type
 */
function parse_legacy(raw: Uint8Array): WorkBook {
  const vers: number = raw[0];
  switch(vers) {
    case 102: // stata 1
    case 112: // stata 8/9
      throw new Error(`Unsupported DTA ${vers} file`);

    case 103: // stata 2/3
    case 104: // stata 4
    case 105: // stata 5
    case 108: // stata 6
    case 110: // stata 7
    case 111: // stata 7
    case 113: // stata 8/9
    case 114: // stata 10/11
    case 115: // stata 12
      break;

    default: throw new Error("Not a DTA file");
  }

  /* cursor starts after the release byte; `str` is unused for legacy files */
  const d: Payload = {
    ptr: 1,
    raw,
    str:"",
    dv: u8_to_dataview(raw)
  };

  /* one shared decoder instead of one per field */
  const utf8 = new TextDecoder();
  /* read a NUL-padded field of `width` bytes at the cursor and advance past it */
  const read_fixed_str = (width: number): string => {
    const text = utf8.decode(d.raw.subarray(d.ptr, d.ptr + width)).replace(/\x00[\s\S]*/,"");
    d.ptr += width;
    return text;
  };

  let LE: boolean = true;
  let nvar: number = 0, nobs: number = 0;
  const var_types: number[] = [];
  const var_names: string[] = [];
  const formats: string[] = [];

  /* 5.1 Header */
  {
    const byteorder = read_u8(d);
    switch(byteorder) {
      case 1: LE = false; break;
      case 2: LE = true; break;
      default: throw new Error(`DTA ${vers} Unexpected byteorder ${byteorder}`);
    }

    const filetype = read_u8(d);
    if(filetype != 1) throw new Error(`DTA ${vers} Unexpected filetype ${filetype}`);
    // NOTE: dta_105 technically supports filetype 2

    d.ptr++; // "unused"
    nvar = read_u16(d, LE);
    nobs = read_u32(d, LE);
    d.ptr += (vers >= 108 ? 81 : 32); // TODO: data_label
    if(vers >= 105) d.ptr += 18; // TODO: time_stamp
  }

  /* 5.2 Descriptors */
  {
    let C = 0;

    // typlist: one type byte per variable
    for(C = 0; C < nvar; ++C) var_types.push(read_u8(d));

    // varlist
    const w = vers >= 110 ? 33 : 9;
    for(C = 0; C < nvar; ++C) var_names.push(read_fixed_str(w));

    // srtlist
    d.ptr += 2*(nvar + 1);

    // fmtlist
    const fw = (vers >= 114 ? 49 : vers >= 105 ? 12 : 7);
    for(C = 0; C < nvar; ++C) formats.push(read_fixed_str(fw));

    // lbllist
    d.ptr += (vers >= 110 ? 33 : 9) * nvar;
  }

  /* 5.3 Variable labels */
  // TODO: should these names be used in the worksheet?
  d.ptr += (vers >= 106 ? 81 : 32) * nvar;

  /* 5.4 Expansion fields: (type, length, payload) records ended by a zero type and length */
  if(vers >= 105) while(d.ptr < d.raw.length) {
    const dt = read_u8(d), len = (vers >= 111 ? read_u32 : read_u16)(d, LE);
    if(dt == 0 && len == 0) break;
    d.ptr += len;
  }

  const ws: DenseWorkSheet = (_utils.aoa_to_sheet([var_names], {dense: true}) as DenseWorkSheet);

  /* 5.5 Data */
  for(let R = 0; R < nobs; ++R) {
    const row: any[] = [];
    for(let C = 0; C < nvar; ++C) {
      const t = var_types[C];
      // TODO: data type processing
      if(vers >= 111 && t >= 1 && t <= 244) {
        /* fixed-width string of t bytes */
        /* NOTE: dta_117 restricts strf to ASCII */
        row[C] = read_fixed_str(t);
      } else switch(t) {
        case 251: case 0x62: row[C] = read_i8(d); break; // byte
        case 252: case 0x69: row[C] = read_i16(d, LE); break; // int
        case 253: case 0x6c: row[C] = read_i32(d, LE); break; // long
        case 254: case 0x66: row[C] = read_f32(d, LE); break; // float
        case 255: case 0x64: row[C] = read_f64(d, LE); break; // double
        default: throw new Error(`Unsupported field type ${t} for ${var_names[C]}`);
      }
    }
    _utils.sheet_add_aoa(ws, [row], {origin: -1, sheetStubs: true});
  }

  /* 5.6 Value labels */
  // EOF or labels

  const wb: WorkBook = _utils.book_new();
  _utils.book_append_sheet(wb, ws, "Sheet1");
  return wb;
}
| 
 | ||||
/**
 * Parse a DTA file into a workbook.
 *
 * The first byte selects the reader: `<` (60) dispatches to the tagged-format
 * parser, a value from 102 through 115 dispatches to the legacy parser.
 *
 * NOTE: In NodeJS, `Buffer` extends `Uint8Array`
 *
 * @param data File data
 * @throws Error "Not a DTA file" when the first byte matches neither layout
 */
function parse(data: Uint8Array): WorkBook {
  const lead = data[0];
  if(lead === 60) return parse_tagged(data);
  if(lead >= 102 && lead <= 115) return parse_legacy(data);
  throw new Error("Not a DTA file");
}
							
								
								
									
										36
									
								
								packages/dta/package.json
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										36
									
								
								packages/dta/package.json
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,36 @@ | ||||
| { | ||||
| 	"name": "dta", | ||||
| 	"version": "0.0.1", | ||||
| 	"author": "sheetjs", | ||||
| 	"description": "Stata .dta codecs for SheetJS Common Spreadsheet Format", | ||||
| 	"bin": { | ||||
| 		"dta2csv": "./bin/dta2csv.njs" | ||||
| 	}, | ||||
| 	"main": "dist/dta.js", | ||||
| 	"types": "types", | ||||
| 	"files": [ | ||||
| 		"dist/" | ||||
| 	], | ||||
| 	"repository": { | ||||
| 		"type": "git", | ||||
| 		"url": "https://git.sheetjs.com/SheetJS/sheetjs", | ||||
| 		"directory": "packages/dta" | ||||
| 	}, | ||||
| 	"scripts": { | ||||
| 		"test": "make test", | ||||
| 		"build": "make", | ||||
| 		"lint": "make fullint", | ||||
| 		"dtslint": "dtslint types" | ||||
| 	}, | ||||
| 	"homepage": "https://sheetjs.com/", | ||||
| 	"bugs": { | ||||
| 		"url": "https://git.sheetjs.com/SheetJS/sheetjs/issues" | ||||
| 	}, | ||||
| 	"license": "Apache-2.0", | ||||
| 	"engines": { | ||||
| 		"node": ">=12.0" | ||||
| 	}, | ||||
| 	"devDependencies": { | ||||
| 		"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.0/xlsx-0.20.0.tgz" | ||||
| 	} | ||||
| } | ||||
							
								
								
									
										32
									
								
								packages/dta/test.js
									
									
									
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										32
									
								
								packages/dta/test.js
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,32 @@ | ||||
| /* eslint-env mocha, node, es6 */ | ||||
| const fs = require("fs"), assert = require("assert"); | ||||
| 
 | ||||
| const DTA = require("./"); | ||||
| const XLSX = require("xlsx"); | ||||
| DTA.set_utils(XLSX.utils); | ||||
| 
 | ||||
/* folders scanned for fixtures; `foo.dta.csv` is the expected CSV for `foo.dta` */
const test_folders = [
  "test_files"
];
test_folders.forEach(tF => describe(tF, () => {
  fs.readdirSync(tF).forEach(tf => {
    /* CSV baseline: parse the matching data file and compare the generated CSV */
    if(tf.endsWith("csv")) {
      const source_name = tf.replace(".csv", "");
      it(`${source_name} [CSV]`, () => {
        const wb = DTA.parse(fs.readFileSync(`${tF}/${source_name}`));
        assert(wb.SheetNames.length > 0);
        const ws = wb.Sheets[wb.SheetNames[0]];
        /* stata will represent unspecified values as single spaces */
        for(const row of ws["!data"]) row.forEach(cell => {
          if(cell.t == "z") { cell.t = "s"; cell.v = " "; }
        });
        const csvstr = XLSX.utils.sheet_to_csv(ws);
        const baseline = fs.readFileSync(`${tF}/${tf}`, "utf8").replace(/[\r\n]+/g,"\n");
        assert.equal(csvstr.trim(), baseline.trim());
      });
    }
    /* smoke test: every .dta fixture must parse into at least one sheet */
    if(tf.endsWith("dta")) it(tf, () => {
      const wb = DTA.parse(fs.readFileSync(`${tF}/${tf}`));
      assert(wb.SheetNames.length > 0);
    });
  });
}));
| 
 | ||||
							
								
								
									
										23
									
								
								packages/dta/types/index.d.ts
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
								
								
								
								
								
									
									
								
							
						
						
									
										23
									
								
								packages/dta/types/index.d.ts
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,23 @@ | ||||
| import type { WorkBook } from "xlsx"; | ||||
| 
 | ||||
/**
 * Register the SheetJS `utils` object used by the parser.
 *
 * Usage:
 *
 * ```js
 * const XLSX = require("xlsx");
 * const DTA = require("dta");
 * DTA.set_utils(XLSX.utils);
 * ```
 *
 * @param utils utils object
 */
export function set_utils(utils: any): void;
| 
 | ||||
/**
 * Parse a DTA file into a workbook.
 *
 * NOTE: In NodeJS, `Buffer` extends `Uint8Array`
 *
 * @param data File data
 */
export function parse(data: Uint8Array): WorkBook;
		Loading…
	
		Reference in New Issue
	
	Block a user