import copy
import glob
import os
from collections import OrderedDict
from datetime import datetime
from functools import partial

import dask.dataframe as dd
import numpy as np
import pandas as pd
import xarray as xr

def _read_globals_attrs(variable_attrs, context=None):
    """Combine attributes from different variables according to combine_attrs.

    Attributes that are identical across all files stay scalar; attributes
    that vary from shot to shot are treated as scan axes and their per-shot
    values are collected into arrays.
    """
    if not variable_attrs:
        # no attributes to merge
        return None
    from xarray.core.utils import equivalent

    result = {}
    dropped_attrs = OrderedDict()
    for attrs in variable_attrs:
        # adopt keys seen for the first time
        result.update(
            {
                key: value
                for key, value in attrs.items()
                if key not in result and key not in dropped_attrs.keys()
            }
        )
        # keep only keys whose value is identical in every file so far
        result = {
            key: value
            for key, value in result.items()
            if key not in attrs or equivalent(attrs[key], value)
        }
        dropped_attrs.update(
            {key: [] for key in attrs if key not in result}
        )
    # collect the per-shot values of every conflicting (scanned) attribute
    for attrs in variable_attrs:
        dropped_attrs.update(
            {
                key: np.append(dropped_attrs[key], attrs[key])
                for key in dropped_attrs.keys()
            }
        )
    # deduplicate scan axes: if two attributes carry the same value sequence,
    # the later one becomes an alias of the first
    scan_attrs = OrderedDict()
    scan_length = []
    for attrs_key in dropped_attrs.keys():
        flag = True
        for key in scan_attrs.keys():
            if equivalent(scan_attrs[key], dropped_attrs[attrs_key]):
                flag = False
                result.update({attrs_key: key})
                break
        if flag:
            scan_attrs.update({attrs_key: dropped_attrs[attrs_key]})
            scan_length = np.append(scan_length, len(dropped_attrs[attrs_key]))
    result.update(scan_attrs)
    result.update(
        {
            "scanAxis": list(scan_attrs.keys()),
            "scanAxisLength": scan_length,
        }
    )
    # if result['scanAxis'] == []:
    #     result['scanAxis'] = ['runs',]
    return result
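# Hypothetical example of the combination above: for per-shot attrs
# [{'freq': 1.0, 'amp': 5.0}, {'freq': 2.0, 'amp': 5.0}], the constant 'amp'
# is kept as the scalar 5.0, while the varying 'freq' is collected into
# [1.0, 2.0] and reported via scanAxis=['freq'] and scanAxisLength=[2.].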

def _read_shot_number_from_hdf5(x):
    """Read the shot number from the file name and store it on the dataset."""
    filePath = x.encoding["source"]
    # file names are assumed to end in "..._<shotNum>.<extension>"
    shotNum = filePath.split("_")[-1].split(".")[0]
    return x.assign(shotNum=shotNum)

def _assign_scan_axis_partial(x, datesetOfGlobal, fullFilePath):
    """Expand a single shot along the scan axes and attach its coordinates."""
    scanAxis = datesetOfGlobal.scanAxis
    filePath = x.encoding["source"].replace("\\", "/")
    # the position of this file in the sorted file list is its shot index
    shotNum = np.squeeze(np.where(fullFilePath == filePath))
    x = x.assign(shotNum=filePath.split("_")[-1].split(".")[0])
    x = x.expand_dims(list(scanAxis))
    return x.assign_coords(
        {
            key: np.atleast_1d(np.atleast_1d(datesetOfGlobal.attrs[key])[int(shotNum)])
            for key in scanAxis
        }
    )

def _update_globals_attrs(variable_attrs, context=None):
    pass  # placeholder, not implemented yet


def update_hdf5_file():
    pass  # placeholder, not implemented yet

def read_hdf5_file(filePath, group=None, datesetOfGlobal=None, preprocess=None,
                   join="outer", parallel=True, engine="h5netcdf",
                   phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
    """Read many single-shot HDF5 files into one dataset with scan axes as dimensions."""
    # expand glob patterns into a flat, sorted list of absolute paths
    # with forward slashes
    filePath = np.sort(np.atleast_1d(filePath))
    filePathAbs = [os.path.abspath(p).replace("\\", "/") for p in filePath]
    fullFilePath = []
    for pattern in filePathAbs:
        fullFilePath.extend(sorted(glob.glob(pattern)))
    fullFilePath = np.array([p.replace("\\", "/") for p in fullFilePath])
    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]
    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': group,
        }
    )
    # combine the 'globals' group of every file, so that varying globals
    # become scan axes
    if datesetOfGlobal is None:
        datesetOfGlobal = xr.open_mfdataset(
            fullFilePath,
            group="globals",
            concat_dim="fileNum",
            combine="nested",
            preprocess=_read_shot_number_from_hdf5,
            engine="h5netcdf",
            phony_dims="access",
            combine_attrs=_read_globals_attrs,
            parallel=True,
        )
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
    _assign_scan_axis = partial(_assign_scan_axis_partial, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
    if preprocess is None:
        kwargs.update({'preprocess': _assign_scan_axis})
    else:
        kwargs.update({'preprocess': preprocess})
    ds = xr.open_mfdataset(fullFilePath, **kwargs)
    # rename the phony dimensions to 'x', 'y', 'z', then 'a' through 'w'
    newDimKey = np.append(['x', 'y', 'z'], [chr(i) for i in range(97, 97 + 23)])
    oldDimKey = np.sort(
        [key for key in ds.dims if key not in datesetOfGlobal.scanAxis]
    )
    renameDict = {oldDimKey[j]: newDimKey[j] for j in range(len(oldDimKey))}
    ds = ds.rename_dims(renameDict)
    ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)
    return ds
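# A minimal usage sketch (hypothetical paths and group name; assumes shot
# files named like "..._<shotNum>.h5" that carry a "globals" group):
#
#     ds = read_hdf5_file("./data/scan_*.h5", group="images", maxFileNum=100)
#     # ds now has the scan axes plus renamed phony dims 'x', 'y', ... as dimensions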

def _assign_scan_axis_partial_and_remove_everything(x, datesetOfGlobal, fullFilePath):
    """Keep only the run time of a shot and attach its scan-axis coordinates."""
    scanAxis = datesetOfGlobal.scanAxis
    filePath = x.encoding["source"].replace("\\", "/")
    shotNum = np.squeeze(np.where(fullFilePath == filePath))
    runTime = _read_run_time_from_hdf5(x)
    # drop all data variables, keeping only the run time
    x = xr.Dataset(data_vars={'runTime': runTime})
    x = x.expand_dims(list(scanAxis))
    return x.assign_coords(
        {
            key: np.atleast_1d(np.atleast_1d(datesetOfGlobal.attrs[key])[int(shotNum)])
            for key in scanAxis
        }
    )


def _read_run_time_from_hdf5(x):
    """Parse the 'run time' attribute (e.g. '20230301T120000') into a datetime."""
    runTime = datetime.strptime(x.attrs['run time'], '%Y%m%dT%H%M%S')
    return runTime

def read_hdf5_run_time(filePath, group=None, datesetOfGlobal=None, preprocess=None,
                       join="outer", parallel=True, engine="h5netcdf",
                       phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
    """Read only the run time of every shot, arranged on the scan axes."""
    filePath = np.sort(np.atleast_1d(filePath))
    filePathAbs = [os.path.abspath(p).replace("\\", "/") for p in filePath]
    fullFilePath = []
    for pattern in filePathAbs:
        fullFilePath.extend(sorted(glob.glob(pattern)))
    fullFilePath = np.array([p.replace("\\", "/") for p in fullFilePath])
    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]
    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': group,
        }
    )
    if datesetOfGlobal is None:
        datesetOfGlobal = xr.open_mfdataset(
            fullFilePath,
            group="globals",
            concat_dim="fileNum",
            combine="nested",
            preprocess=_read_shot_number_from_hdf5,
            engine="h5netcdf",
            phony_dims="access",
            combine_attrs=_read_globals_attrs,
            parallel=True,
        )
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
    _assign_scan_axis = partial(_assign_scan_axis_partial_and_remove_everything, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
    if preprocess is None:
        kwargs.update({'preprocess': _assign_scan_axis})
    else:
        kwargs.update({'preprocess': preprocess})
    ds = xr.open_mfdataset(fullFilePath, **kwargs)
    # rename the phony dimensions to 'x', 'y', 'z', then 'a' through 'w'
    newDimKey = np.append(['x', 'y', 'z'], [chr(i) for i in range(97, 97 + 23)])
    oldDimKey = np.sort(
        [key for key in ds.dims if key not in datesetOfGlobal.scanAxis]
    )
    renameDict = {oldDimKey[j]: newDimKey[j] for j in range(len(oldDimKey))}
    ds = ds.rename_dims(renameDict)
    ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)
    return ds
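# A minimal usage sketch (hypothetical paths): collect only the per-shot run
# times on the scan grid, e.g. to check acquisition timing:
#
#     runTimes = read_hdf5_run_time("./data/scan_*.h5")['runTime']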

def read_hdf5_global(filePath, preprocess=None, join="outer", combine="nested",
                     parallel=True, engine="h5netcdf", phony_dims="access",
                     excludeAxis=[], maxFileNum=None, **kwargs):
    """Read only the 'globals' group of every file and combine the attributes."""
    filePath = np.sort(np.atleast_1d(filePath))
    filePathAbs = [os.path.abspath(p).replace("\\", "/") for p in filePath]
    fullFilePath = []
    for pattern in filePathAbs:
        fullFilePath.extend(sorted(glob.glob(pattern)))
    fullFilePath = np.array([p.replace("\\", "/") for p in fullFilePath])
    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]
    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': "globals",
            'preprocess': _read_shot_number_from_hdf5,
            'combine_attrs': _read_globals_attrs,
            'combine': combine,
            'concat_dim': "fileNum",
        }
    )
    datesetOfGlobal = xr.open_mfdataset(fullFilePath, **kwargs)
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
    return datesetOfGlobal
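# A minimal usage sketch (hypothetical paths), reading only the scan metadata:
#
#     globalDs = read_hdf5_global("./data/scan_*.h5", excludeAxis=['runs'])
#     print(globalDs.attrs['scanAxis'], globalDs.attrs['scanAxisLength'])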

def _read_csv_file_pandas(filePath, **kwargs):
    """Read one CSV file with pandas and return its columns as a numpy array."""
    res = pd.read_csv(filePath, **kwargs)
    res = xr.Dataset.from_dataframe(res).to_array().to_numpy()
    return res


def _read_csv_file_dask(filePath, **kwargs):
    """Read one CSV file with dask and return its columns as a numpy array."""
    # Dataset.from_dataframe expects a pandas DataFrame, so compute first
    res = dd.read_csv(filePath, **kwargs).compute()
    res = xr.Dataset.from_dataframe(res).to_array().to_numpy()
    return res

def read_csv_file(filePath, maxFileNum=None, dask='parallelized', vectorize=True,
                  csvEngine='pandas', daskKwargs={}, csvKwargs={}, **kwargs):
    """Read many CSV files into one dataset with a 'fileIndex' dimension."""
    filePath = np.sort(np.atleast_1d(filePath))
    filePathAbs = [os.path.abspath(p).replace("\\", "/") for p in filePath]
    fullFilePath = []
    for pattern in filePathAbs:
        fullFilePath.extend(sorted(glob.glob(pattern)))
    fullFilePath = np.array([p.replace("\\", "/") for p in fullFilePath])
    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]
    # read the first file eagerly to learn the column names and dimensions
    if csvEngine == 'pandas':
        res_first = pd.read_csv(fullFilePath[0], **csvKwargs)
    elif csvEngine == 'dask':
        res_first = dd.read_csv(fullFilePath[0], **csvKwargs).compute()
    res_first = xr.Dataset.from_dataframe(res_first)
    data_vars = list(res_first.keys())
    # join tuple column names (e.g. from a header MultiIndex) into plain strings
    if len(np.shape(data_vars)) > 1:
        data_vars = np.array(
            [''.join(data_vars[i]) for i in range(np.shape(data_vars)[0])]
        )
    fullFilePath = xr.DataArray(data=fullFilePath, dims=['fileIndex'])
    # output dims: the column axis first, then the dims of the first file,
    # padded with spare names
    newDimKey = np.append(['data_vars'], list(res_first.dims.keys()))
    newDimKey = np.append(newDimKey, ['x', 'y', 'z'])
    newDimKey = np.append(newDimKey, [chr(i) for i in range(97, 97 + 23)])
    kwargs.update(
        {
            'dask': dask,
            'vectorize': vectorize,
            'output_core_dims': [newDimKey[0:len(res_first.dims) + 1]],
            'dask_gufunc_kwargs': daskKwargs,
        }
    )
    if csvEngine == 'pandas':
        res = xr.apply_ufunc(_read_csv_file_pandas, fullFilePath, kwargs=csvKwargs, **kwargs)
    elif csvEngine == 'dask':
        res = xr.apply_ufunc(_read_csv_file_dask, fullFilePath, kwargs=csvKwargs, **kwargs)
    res = res.assign_coords({'data_vars': data_vars})
    res = res.to_dataset(dim='data_vars')
    for key in list(res_first.coords.keys()):
        res = res.assign_coords({key: res_first[key]})
    return res
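# A minimal usage sketch (hypothetical paths; csvKwargs are forwarded to the
# underlying pandas/dask read_csv call):
#
#     res = read_csv_file("./data/log_*.csv", csvEngine='pandas',
#                         csvKwargs={'header': 0})
#     # res is a Dataset with one variable per CSV column and a 'fileIndex' dim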