From 43106834a5f22733d3a4a189fc73255a77260cce Mon Sep 17 00:00:00 2001 From: Gao Date: Mon, 22 May 2023 19:35:09 +0200 Subject: [PATCH] implement reading from csv --- Analyser/FitAnalyser.py | 2 +- DataContainer/ReadData.py | 94 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/Analyser/FitAnalyser.py b/Analyser/FitAnalyser.py index cc12f87..4fd7591 100644 --- a/Analyser/FitAnalyser.py +++ b/Analyser/FitAnalyser.py @@ -284,7 +284,7 @@ class FitAnalyser(): self.fitDim = fitDim - def print_params_set_templat(self, params=None): + def print_params_set_template(self, params=None): if params is None: params = self.fitModel.make_params() diff --git a/DataContainer/ReadData.py b/DataContainer/ReadData.py index 799bc95..6928cb8 100644 --- a/DataContainer/ReadData.py +++ b/DataContainer/ReadData.py @@ -1,4 +1,6 @@ import xarray as xr +import dask.dataframe as df +import pandas as pd import numpy as np from collections import OrderedDict from functools import partial @@ -188,6 +190,7 @@ def read_hdf5_file(filePath, group=None, datesetOfGlobal=None, preprocess=None, return ds + def _assign_scan_axis_partial_and_remove_everything(x, datesetOfGlobal, fullFilePath): scanAxis = datesetOfGlobal.scanAxis filePath = x.encoding["source"].replace("\\", "/") @@ -208,6 +211,7 @@ def _read_run_time_from_hdf5(x): runTime = datetime.strptime(x.attrs['run time'], '%Y%m%dT%H%M%S') return runTime + def read_hdf5_run_time(filePath, group=None, datesetOfGlobal=None, preprocess=None, join="outer", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs): filePath = np.sort(np.atleast_1d(filePath)) @@ -282,6 +286,7 @@ def read_hdf5_run_time(filePath, group=None, datesetOfGlobal=None, preprocess=No return ds + def read_hdf5_global(filePath, preprocess=None, join="outer", combine="nested", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs): filePath = 
def _read_csv_file_pandas(filePath, **kwargs):
    """Read one CSV file with pandas and return its content as a NumPy array.

    Parameters
    ----------
    filePath : str
        Path of the CSV file, handed to ``pandas.read_csv``.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        The frame converted through ``xarray.Dataset.from_dataframe`` and
        ``Dataset.to_array``, i.e. one leading axis for the data variables
        followed by the dimensions of the dataset.
    """
    frame = pd.read_csv(filePath, **kwargs)
    return xr.Dataset.from_dataframe(frame).to_array().to_numpy()


def _read_csv_file_dask(filePath, **kwargs):
    """Read one CSV file with dask and return its content as a NumPy array.

    Same contract as ``_read_csv_file_pandas`` but the file is parsed with
    ``dask.dataframe.read_csv`` (imported in this module as ``df``).
    """
    # NOTE(review): dask's read_csv returns a lazy dask DataFrame, whereas
    # Dataset.from_dataframe is documented for pandas DataFrames -- confirm
    # that this path works as intended or materialise with ``.compute()``.
    frame = df.read_csv(filePath, **kwargs)
    return xr.Dataset.from_dataframe(frame).to_array().to_numpy()


def _expand_csv_paths(filePath, maxFileNum=None):
    """Expand path(s) or glob pattern(s) into a flat array of file paths.

    Patterns are made absolute, sorted, and globbed one after another; the
    matches of each pattern are sorted and all separators are normalised to
    forward slashes.

    Raises
    ------
    FileNotFoundError
        If no pattern matches any file.
    ValueError
        If ``maxFileNum`` leaves no file to read.
    """
    patterns = [
        os.path.abspath(path).replace("\\", "/")
        for path in np.sort(np.atleast_1d(filePath))
    ]

    # Flatten in pure Python: patterns may match different numbers of files,
    # and such ragged nested lists cannot be flattened through np.array()
    # (recent NumPy versions raise ValueError for them).
    matches = [
        match.replace("\\", "/")
        for pattern in patterns
        for match in sorted(glob.glob(pattern))
    ]
    if not matches:
        raise FileNotFoundError(
            f"No file found matching the given path(s): {patterns!r}"
        )

    if maxFileNum is not None:
        matches = matches[:int(maxFileNum)]
        if not matches:
            raise ValueError(
                f"maxFileNum={maxFileNum!r} leaves no file to read"
            )

    return np.array(matches)


def read_csv_file(filePath, maxFileNum=None, dask='parallelized', vectorize=True,
                  csvEngine='pandas', daskKwargs={}, csvKwargs={}, **kwargs):
    """Read one or many CSV files into a single ``xarray.Dataset``.

    Every matched file is read by ``xarray.apply_ufunc`` mapped over a
    ``fileIndex`` dimension. The names of the data variables and the
    coordinates are taken from the first file and applied to the combined
    result, so all files are expected to share the layout of the first one.

    Parameters
    ----------
    filePath : str or sequence of str
        File path(s) or glob pattern(s) of the CSV files.
    maxFileNum : int, optional
        If given, only the first ``maxFileNum`` matched files are read.
    dask : str, optional
        Passed to ``xarray.apply_ufunc`` as ``dask``.
    vectorize : bool, optional
        Passed to ``xarray.apply_ufunc`` as ``vectorize``.
    csvEngine : {'pandas', 'dask'}, optional
        Library used to parse the CSV files.
    daskKwargs : dict, optional
        Passed to ``xarray.apply_ufunc`` as ``dask_gufunc_kwargs``.
    csvKwargs : dict, optional
        Keyword arguments forwarded to the ``read_csv`` function of the
        selected engine.
    **kwargs
        Further keyword arguments for ``xarray.apply_ufunc``. The entries
        ``dask``, ``vectorize``, ``output_core_dims`` and
        ``dask_gufunc_kwargs`` are always overwritten by this function.

    Returns
    -------
    xarray.Dataset
        One data variable per CSV column, with a ``fileIndex`` dimension in
        addition to the dimensions of the first file.

    Raises
    ------
    ValueError
        If ``csvEngine`` is not supported, or ``maxFileNum`` selects no file.
    FileNotFoundError
        If ``filePath`` matches no file.
    """
    # Fail early with a clear message instead of an UnboundLocalError later.
    if csvEngine not in ('pandas', 'dask'):
        raise ValueError(
            f"Unsupported csvEngine {csvEngine!r}; expected 'pandas' or 'dask'"
        )

    # Work on private copies so the shared default dicts (and the caller's
    # dicts) can never be modified further down the call chain.
    csvKwargs = dict(csvKwargs or {})
    daskKwargs = dict(daskKwargs or {})

    fullFilePath = _expand_csv_paths(filePath, maxFileNum)

    # The first file is read eagerly; it serves as the template for variable
    # names, dimensions and coordinates of the combined dataset.
    if csvEngine == 'pandas':
        res_first = pd.read_csv(fullFilePath[0], **csvKwargs)
        readFunc = _read_csv_file_pandas
    else:
        res_first = df.read_csv(fullFilePath[0], **csvKwargs)
        readFunc = _read_csv_file_dask
    res_first = xr.Dataset.from_dataframe(res_first)

    data_vars = list(res_first.keys())
    if len(np.shape(data_vars)) > 1:
        # Tuple-valued column names (multi-level header): join the levels
        # into a single string per variable.
        data_vars = np.array([''.join(name) for name in data_vars])

    fullFilePath = xr.DataArray(data=fullFilePath, dims=['fileIndex'])

    # Each file yields an array laid out as (data_vars, *dims of the file).
    outputDims = ['data_vars', *res_first.dims]

    kwargs.update(
        {
            'dask': dask,
            'vectorize': vectorize,
            'output_core_dims': [outputDims],
            'dask_gufunc_kwargs': daskKwargs,
        }
    )

    res = xr.apply_ufunc(readFunc, fullFilePath, kwargs=csvKwargs, **kwargs)

    res = res.assign_coords({'data_vars': data_vars})
    res = res.to_dataset(dim='data_vars')

    for key in res_first.coords:
        res = res.assign_coords({key: res_first[key]})

    return res