2023-04-24 13:03:23 +02:00
|
|
|
import xarray as xr
|
2023-05-22 19:35:09 +02:00
|
|
|
import dask.dataframe as df
|
|
|
|
import pandas as pd
|
2023-04-24 13:03:23 +02:00
|
|
|
import numpy as np
|
|
|
|
from collections import OrderedDict
|
|
|
|
from functools import partial
|
|
|
|
import copy
|
2023-05-07 00:38:52 +02:00
|
|
|
import glob
|
|
|
|
import os
|
2023-05-18 16:09:20 +02:00
|
|
|
from datetime import datetime
|
2023-04-24 13:03:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _read_globals_attrs(variable_attrs, context=None):
|
|
|
|
"""Combine attributes from different variables according to combine_attrs"""
|
|
|
|
if not variable_attrs:
|
|
|
|
# no attributes to merge
|
|
|
|
return None
|
|
|
|
|
|
|
|
from xarray.core.utils import equivalent
|
|
|
|
|
|
|
|
result = {}
|
|
|
|
dropped_attrs = OrderedDict()
|
|
|
|
for attrs in variable_attrs:
|
|
|
|
result.update(
|
|
|
|
{
|
|
|
|
key: value
|
|
|
|
for key, value in attrs.items()
|
|
|
|
if key not in result and key not in dropped_attrs.keys()
|
|
|
|
}
|
|
|
|
)
|
|
|
|
result = {
|
|
|
|
key: value
|
|
|
|
for key, value in result.items()
|
|
|
|
if key not in attrs or equivalent(attrs[key], value)
|
|
|
|
}
|
|
|
|
dropped_attrs.update(
|
|
|
|
{
|
|
|
|
key: []
|
|
|
|
for key in attrs if key not in result
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
for attrs in variable_attrs:
|
|
|
|
dropped_attrs.update(
|
|
|
|
{
|
|
|
|
key: np.append(dropped_attrs[key], attrs[key])
|
|
|
|
for key in dropped_attrs.keys()
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
scan_attrs = OrderedDict()
|
|
|
|
scan_length = []
|
|
|
|
for attrs_key in dropped_attrs.keys():
|
|
|
|
flag = True
|
|
|
|
for key in scan_attrs.keys():
|
|
|
|
if equivalent(scan_attrs[key], dropped_attrs[attrs_key]):
|
|
|
|
flag = False
|
|
|
|
|
|
|
|
result.update({attrs_key: key})
|
|
|
|
|
|
|
|
break
|
|
|
|
if flag:
|
|
|
|
scan_attrs.update({
|
|
|
|
attrs_key: dropped_attrs[attrs_key]
|
|
|
|
})
|
|
|
|
scan_length = np.append(scan_length, len(dropped_attrs[attrs_key]))
|
|
|
|
|
|
|
|
result.update(
|
|
|
|
{
|
|
|
|
key: value
|
|
|
|
for key, value in scan_attrs.items()
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
|
|
|
result.update(
|
|
|
|
{
|
|
|
|
"scanAxis": list(scan_attrs.keys()),
|
|
|
|
"scanAxisLength": scan_length,
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2023-05-07 00:38:52 +02:00
|
|
|
# if result['scanAxis'] == []:
|
|
|
|
# result['scanAxis'] = ['runs',]
|
|
|
|
|
2023-04-24 13:03:23 +02:00
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def _read_shot_number_from_hdf5(x):
|
|
|
|
filePath = x.encoding["source"]
|
|
|
|
shotNum = filePath.split("_")[-1].split("_")[-1].split(".")[0]
|
|
|
|
return x.assign(shotNum=shotNum)
|
|
|
|
|
|
|
|
|
2023-05-07 00:38:52 +02:00
|
|
|
def _assign_scan_axis_partial(x, datesetOfGlobal, fullFilePath):
|
2023-04-24 13:03:23 +02:00
|
|
|
scanAxis = datesetOfGlobal.scanAxis
|
2023-05-07 00:38:52 +02:00
|
|
|
filePath = x.encoding["source"].replace("\\", "/")
|
|
|
|
shotNum = np.where(fullFilePath==filePath)
|
|
|
|
shotNum = np.squeeze(shotNum)
|
|
|
|
# shotNum = filePath.split("_")[-1].split("_")[-1].split(".")[0]
|
2023-04-24 13:03:23 +02:00
|
|
|
x = x.assign(shotNum=shotNum)
|
|
|
|
x = x.expand_dims(list(scanAxis))
|
2023-05-07 00:38:52 +02:00
|
|
|
|
2023-04-24 13:03:23 +02:00
|
|
|
return x.assign_coords(
|
|
|
|
{
|
2023-05-07 00:38:52 +02:00
|
|
|
key: np.atleast_1d(np.atleast_1d(datesetOfGlobal.attrs[key])[int(shotNum)])
|
2023-04-24 13:03:23 +02:00
|
|
|
for key in scanAxis
|
|
|
|
}
|
|
|
|
)
|
2023-05-04 13:47:33 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _update_globals_attrs(variable_attrs, context=None):
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
def update_hdf5_file():
    """Placeholder with no behaviour yet; returns ``None``."""
|
2023-04-24 13:03:23 +02:00
|
|
|
|
|
|
|
|
2023-05-16 15:51:13 +02:00
|
|
|
def read_hdf5_file(filePath, group=None, datesetOfGlobal=None, preprocess=None, join="outer", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
    """Open a set of HDF5 files as one dataset laid out along the scan axes.

    Args:
        filePath: One path / glob pattern, or a sequence of them. Patterns are
            sorted, made absolute and expanded with ``glob``; the matches of
            each pattern are sorted as well.
        group: Group to read; forwarded to ``xr.open_mfdataset``.
        datesetOfGlobal: Dataset whose ``attrs`` contain ``scanAxis`` and the
            per-file values of every scan axis. When ``None`` it is read from
            the ``"globals"`` group of the same files (that read always uses
            ``engine="h5netcdf"``, ``phony_dims="access"`` and
            ``parallel=True``). Its ``attrs['scanAxis']`` is overwritten in
            place.
        preprocess: Per-file callable forwarded to ``xr.open_mfdataset``.
            Defaults to assigning the shot index and scan-axis coordinates.
        join, parallel, engine, phony_dims: Forwarded to
            ``xr.open_mfdataset`` for the data read.
        excludeAxis: Names removed from ``scanAxis``. Never mutated here.
        maxFileNum: When not ``None``, only the first ``int(maxFileNum)``
            matched files are read.
        **kwargs: Further keyword arguments for ``xr.open_mfdataset``; the
            explicit parameters above overwrite same-named keys.

    Returns:
        The combined dataset. Dimensions that are not scan axes are renamed,
        in sorted order of their original names, to ``x, y, z, a, b, ...``;
        ``attrs`` is a deep copy of the globals dataset's ``attrs``.

    Raises:
        FileNotFoundError: If no file matches ``filePath``.
    """
    patterns = [
        os.path.abspath(pattern).replace("\\", "/")
        for pattern in np.sort(np.atleast_1d(filePath))
    ]

    # Build one flat list directly: stacking the per-pattern match lists into
    # an array and flattening it fails as soon as two patterns match a
    # different number of files (ragged input).
    matched = [
        path.replace("\\", "/")
        for pattern in patterns
        for path in sorted(glob.glob(pattern))
    ]
    if not matched:
        raise FileNotFoundError(f"No files matched {patterns}")

    # Kept as an array: the preprocess step compares it element-wise with ``==``.
    fullFilePath = np.array(matched)

    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]

    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': group,
        }
    )

    if datesetOfGlobal is None:
        datesetOfGlobal = xr.open_mfdataset(
            fullFilePath,
            group="globals",
            concat_dim="fileNum",
            combine="nested",
            preprocess=_read_shot_number_from_hdf5,
            engine="h5netcdf",
            phony_dims="access",
            combine_attrs=_read_globals_attrs,
            parallel=True,
        )

    # np.setdiff1d also sorts and de-duplicates the remaining axis names.
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)

    _assgin_scan_axis = partial(_assign_scan_axis_partial, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
    kwargs.update({'preprocess': _assgin_scan_axis if preprocess is None else preprocess})

    ds = xr.open_mfdataset(fullFilePath, **kwargs)

    # Replacement names for the non-scan dimensions: x, y, z, then a..w.
    newDimKey = np.append(['x', 'y', 'z'], [chr(i) for i in range(97, 97 + 23)])
    oldDimKey = np.sort([key for key in ds.dims if key not in datesetOfGlobal.scanAxis])
    renameDict = {old: newDimKey[j] for j, old in enumerate(oldDimKey)}
    ds = ds.rename_dims(renameDict)

    ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)

    return ds
|
|
|
|
|
2023-05-22 19:35:09 +02:00
|
|
|
|
2023-05-18 16:09:20 +02:00
|
|
|
def _assign_scan_axis_partial_and_remove_everything(x, datesetOfGlobal, fullFilePath):
    """Replace one file's dataset by its run time, placed on the scan axes.

    Args:
        x: Dataset of a single file; ``x.encoding["source"]`` must be its path
            and ``x.attrs['run time']`` its run-time stamp.
        datesetOfGlobal: Object whose ``scanAxis`` lists the scan-axis names
            and whose ``attrs[name]`` holds the per-file values of each axis.
        fullFilePath: Array of file paths, compared element-wise against the
            source path to find this file's index.

    Returns:
        A new dataset containing only a ``runTime`` variable, with one new
        dimension per scan axis and a length-1 coordinate on each of them.
    """
    scan_axes = datesetOfGlobal.scanAxis

    # Normalise Windows separators so the path compares equal to fullFilePath entries.
    source = x.encoding["source"].replace("\\", "/")
    shot_index = np.squeeze(np.where(fullFilePath == source))

    run_time = _read_run_time_from_hdf5(x)

    # Everything except the run time is discarded here.
    reduced = xr.Dataset(data_vars={'runTime': run_time})
    reduced = reduced.expand_dims(list(scan_axes))

    coords = {}
    for axis in scan_axes:
        axis_values = np.atleast_1d(datesetOfGlobal.attrs[axis])
        coords[axis] = np.atleast_1d(axis_values[int(shot_index)])
    return reduced.assign_coords(coords)
|
2023-05-23 22:14:03 +02:00
|
|
|
|
2023-05-18 16:09:20 +02:00
|
|
|
|
|
|
|
def _read_run_time_from_hdf5(x):
|
|
|
|
runTime = datetime.strptime(x.attrs['run time'], '%Y%m%dT%H%M%S')
|
|
|
|
return runTime
|
|
|
|
|
2023-05-22 19:35:09 +02:00
|
|
|
|
2023-05-18 16:09:20 +02:00
|
|
|
def read_hdf5_run_time(filePath, group=None, datesetOfGlobal=None, preprocess=None, join="outer", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
    """Open a set of HDF5 files and collect only their run times along the scan axes.

    Args:
        filePath: One path / glob pattern, or a sequence of them. Patterns are
            sorted, made absolute and expanded with ``glob``; the matches of
            each pattern are sorted as well.
        group: Group to read; forwarded to ``xr.open_mfdataset``.
        datesetOfGlobal: Dataset whose ``attrs`` contain ``scanAxis`` and the
            per-file values of every scan axis. When ``None`` it is read from
            the ``"globals"`` group of the same files (that read always uses
            ``engine="h5netcdf"``, ``phony_dims="access"`` and
            ``parallel=True``). Its ``attrs['scanAxis']`` is overwritten in
            place.
        preprocess: Per-file callable forwarded to ``xr.open_mfdataset``.
            Defaults to a step that keeps only the file's run time and assigns
            the scan-axis coordinates.
        join, parallel, engine, phony_dims: Forwarded to
            ``xr.open_mfdataset`` for the data read.
        excludeAxis: Names removed from ``scanAxis``. Never mutated here.
        maxFileNum: When not ``None``, only the first ``int(maxFileNum)``
            matched files are read.
        **kwargs: Further keyword arguments for ``xr.open_mfdataset``; the
            explicit parameters above overwrite same-named keys.

    Returns:
        The combined dataset. Dimensions that are not scan axes are renamed,
        in sorted order of their original names, to ``x, y, z, a, b, ...``;
        ``attrs`` is a deep copy of the globals dataset's ``attrs``.

    Raises:
        FileNotFoundError: If no file matches ``filePath``.
    """
    patterns = [
        os.path.abspath(pattern).replace("\\", "/")
        for pattern in np.sort(np.atleast_1d(filePath))
    ]

    # Build one flat list directly: stacking the per-pattern match lists into
    # an array and flattening it fails as soon as two patterns match a
    # different number of files (ragged input).
    matched = [
        path.replace("\\", "/")
        for pattern in patterns
        for path in sorted(glob.glob(pattern))
    ]
    if not matched:
        raise FileNotFoundError(f"No files matched {patterns}")

    # Kept as an array: the preprocess step compares it element-wise with ``==``.
    fullFilePath = np.array(matched)

    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]

    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': group,
        }
    )

    if datesetOfGlobal is None:
        datesetOfGlobal = xr.open_mfdataset(
            fullFilePath,
            group="globals",
            concat_dim="fileNum",
            combine="nested",
            preprocess=_read_shot_number_from_hdf5,
            engine="h5netcdf",
            phony_dims="access",
            combine_attrs=_read_globals_attrs,
            parallel=True,
        )

    # np.setdiff1d also sorts and de-duplicates the remaining axis names.
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)

    _assgin_scan_axis = partial(_assign_scan_axis_partial_and_remove_everything, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
    kwargs.update({'preprocess': _assgin_scan_axis if preprocess is None else preprocess})

    ds = xr.open_mfdataset(fullFilePath, **kwargs)

    # Replacement names for the non-scan dimensions: x, y, z, then a..w.
    newDimKey = np.append(['x', 'y', 'z'], [chr(i) for i in range(97, 97 + 23)])
    oldDimKey = np.sort([key for key in ds.dims if key not in datesetOfGlobal.scanAxis])
    renameDict = {old: newDimKey[j] for j, old in enumerate(oldDimKey)}
    ds = ds.rename_dims(renameDict)

    ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)

    return ds
|
|
|
|
|
2023-05-22 19:35:09 +02:00
|
|
|
|
2023-05-18 16:09:20 +02:00
|
|
|
def read_hdf5_global(filePath, preprocess=None, join="outer", combine="nested", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
    """Read the ``"globals"`` group of a set of HDF5 files into one dataset.

    Args:
        filePath: One path / glob pattern, or a sequence of them. Patterns are
            sorted, made absolute and expanded with ``glob``; the matches of
            each pattern are sorted as well.
        preprocess: Accepted but not used: the per-file step is always the
            shot-number reader of this module.
            NOTE(review): confirm whether a caller-supplied value should win.
        join, combine, parallel, engine, phony_dims: Forwarded to
            ``xr.open_mfdataset``.
        excludeAxis: Names removed from ``scanAxis``. Never mutated here.
        maxFileNum: When not ``None``, only the first ``int(maxFileNum)``
            matched files are read.
        **kwargs: Further keyword arguments for ``xr.open_mfdataset``; the
            keys set by this function overwrite same-named entries.

    Returns:
        The dataset concatenated along ``fileNum``, whose ``attrs`` are built
        by this module's attribute combiner and whose ``attrs['scanAxis']``
        has ``excludeAxis`` removed.

    Raises:
        FileNotFoundError: If no file matches ``filePath``.
    """
    patterns = [
        os.path.abspath(pattern).replace("\\", "/")
        for pattern in np.sort(np.atleast_1d(filePath))
    ]

    # Build one flat list directly: stacking the per-pattern match lists into
    # an array and flattening it fails as soon as two patterns match a
    # different number of files (ragged input).
    matched = [
        path.replace("\\", "/")
        for pattern in patterns
        for path in sorted(glob.glob(pattern))
    ]
    if not matched:
        raise FileNotFoundError(f"No files matched {patterns}")

    fullFilePath = np.array(matched)

    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]

    kwargs.update(
        {
            'join': join,
            'parallel': parallel,
            'engine': engine,
            'phony_dims': phony_dims,
            'group': "globals",
            'preprocess': _read_shot_number_from_hdf5,
            'combine_attrs': _read_globals_attrs,
            'combine': combine,
            'concat_dim': "fileNum",
        }
    )

    datesetOfGlobal = xr.open_mfdataset(fullFilePath, **kwargs)

    # np.setdiff1d also sorts and de-duplicates the remaining axis names.
    datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)

    return datesetOfGlobal
|
2023-05-22 19:35:09 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _read_csv_file_pandas(filePath, **kwargs):
    """Read one CSV file with pandas and return its content as a NumPy array.

    The frame goes through ``xr.Dataset.from_dataframe(...).to_array()``, so
    the data variables (columns) are stacked along the first axis of the result.

    Args:
        filePath: Path handed to ``pd.read_csv``.
        **kwargs: Extra keyword arguments for ``pd.read_csv``.
    """
    frame = pd.read_csv(filePath, **kwargs)
    stacked = xr.Dataset.from_dataframe(frame).to_array()
    return stacked.to_numpy()
|
|
|
|
|
|
|
|
|
|
|
|
def _read_csv_file_dask(filePath, **kwargs):
    """Read one CSV file with ``dask.dataframe`` and return its content as a NumPy array.

    The frame goes through ``xr.Dataset.from_dataframe(...).to_array()``, so
    the data variables (columns) are stacked along the first axis of the result.

    Args:
        filePath: Path handed to ``dask.dataframe.read_csv``.
        **kwargs: Extra keyword arguments for ``dask.dataframe.read_csv``.
    """
    # NOTE(review): the lazy dask frame is passed to from_dataframe as is —
    # confirm that this works without computing it first.
    frame = df.read_csv(filePath, **kwargs)
    stacked = xr.Dataset.from_dataframe(frame).to_array()
    return stacked.to_numpy()
|
|
|
|
|
|
|
|
|
|
|
|
def read_csv_file(filePath, maxFileNum=None, dask='parallelized', vectorize=True, csvEngine='pandas', daskKwargs={}, csvKwargs={}, **kwargs):
    """Read a set of CSV files into one dataset indexed by ``fileIndex``.

    The first file is read eagerly to learn the column names, dimensions and
    coordinates; all files are then read through ``xr.apply_ufunc``.

    Args:
        filePath: One path / glob pattern, or a sequence of them. Patterns are
            sorted, made absolute and expanded with ``glob``; the matches of
            each pattern are sorted as well.
        maxFileNum: When not ``None``, only the first ``int(maxFileNum)``
            matched files are read.
        dask, vectorize: Forwarded to ``xr.apply_ufunc``.
        csvEngine: ``'pandas'`` or ``'dask'``; selects the CSV reader.
        daskKwargs: Passed to ``xr.apply_ufunc`` as ``dask_gufunc_kwargs``.
            Never mutated here.
        csvKwargs: Keyword arguments for the CSV reader. Never mutated here.
        **kwargs: Further keyword arguments for ``xr.apply_ufunc``; the keys
            set by this function overwrite same-named entries.

    Returns:
        A dataset with one variable per CSV column, carrying the coordinates
        of the first file.

    Raises:
        ValueError: If ``csvEngine`` is neither ``'pandas'`` nor ``'dask'``.
        FileNotFoundError: If no file matches ``filePath``.
    """
    # Fail early: an unknown engine used to surface only later as an unbound
    # local variable.
    if csvEngine not in ('pandas', 'dask'):
        raise ValueError(f"csvEngine must be 'pandas' or 'dask', got {csvEngine!r}")

    patterns = [
        os.path.abspath(pattern).replace("\\", "/")
        for pattern in np.sort(np.atleast_1d(filePath))
    ]

    # Build one flat list directly: stacking the per-pattern match lists into
    # an array and flattening it fails as soon as two patterns match a
    # different number of files (ragged input).
    matched = [
        path.replace("\\", "/")
        for pattern in patterns
        for path in sorted(glob.glob(pattern))
    ]
    if not matched:
        raise FileNotFoundError(f"No files matched {patterns}")

    fullFilePath = np.array(matched)

    if maxFileNum is not None:
        fullFilePath = fullFilePath[0:int(maxFileNum)]

    # Read the first file up front to discover the layout shared by all files.
    if csvEngine == 'pandas':
        res_first = pd.read_csv(fullFilePath[0], **csvKwargs)
    else:
        res_first = df.read_csv(fullFilePath[0], **csvKwargs)

    res_first = xr.Dataset.from_dataframe(res_first)

    data_vars = list(res_first.keys())

    # Multi-part column names (e.g. tuples) are joined into single strings.
    if len(np.shape(data_vars)) > 1:
        data_vars = np.array([''.join(name) for name in data_vars])

    fullFilePath = xr.DataArray(
        data=fullFilePath,
        dims=['fileIndex']
    )

    # Each file yields an array whose first axis is the column axis,
    # followed by the dimensions of the frame.
    output_dims = ['data_vars'] + [str(name) for name in res_first.dims]

    kwargs.update(
        {
            'dask': dask,
            'vectorize': vectorize,
            'output_core_dims': [output_dims],
            "dask_gufunc_kwargs": daskKwargs,
        }
    )

    reader = _read_csv_file_pandas if csvEngine == 'pandas' else _read_csv_file_dask
    res = xr.apply_ufunc(reader, fullFilePath, kwargs=csvKwargs, **kwargs)

    res = res.assign_coords({'data_vars': data_vars})
    res = res.to_dataset(dim='data_vars')

    for key in list(res_first.coords.keys()):
        res = res.assign_coords({key: res_first[key]})

    return res
|