You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

323 lines
9.3 KiB

1 year ago
  1. import xarray as xr
  2. import numpy as np
  3. from collections import OrderedDict
  4. from functools import partial
  5. import copy
  6. import glob
  7. import os
  8. from datetime import datetime
def _read_globals_attrs(variable_attrs, context=None):
    """Combine attributes from different variables according to combine_attrs.

    Used as the ``combine_attrs`` callback of ``xr.open_mfdataset``: it merges
    the per-file "globals" attribute dicts, keeps values that are equivalent
    across all files, and collects the values that differ per file as "scan
    axes".

    Returns ``None`` when there is nothing to merge; otherwise a dict holding
    the constant attributes, one value array per scan axis, plus the
    bookkeeping keys ``"scanAxis"`` (axis names) and ``"scanAxisLength"``
    (number of collected values per axis).
    """
    if not variable_attrs:
        # no attributes to merge
        return None
    from xarray.core.utils import equivalent
    result = {}
    dropped_attrs = OrderedDict()
    # Pass 1: keep attributes equivalent in every file in `result`; record the
    # names of attributes whose values differ across files in `dropped_attrs`.
    for attrs in variable_attrs:
        result.update(
            {
                key: value
                for key, value in attrs.items()
                if key not in result and key not in dropped_attrs.keys()
            }
        )
        result = {
            key: value
            for key, value in result.items()
            if key not in attrs or equivalent(attrs[key], value)
        }
        dropped_attrs.update(
            {
                key: []
                for key in attrs if key not in result
            }
        )
    # Pass 2: gather the per-file values of every dropped attribute.
    # NOTE(review): attrs[key] raises KeyError if a dropped key is absent from
    # one file — this assumes all files share the same globals keys; confirm.
    for attrs in variable_attrs:
        dropped_attrs.update(
            {
                key: np.append(dropped_attrs[key], attrs[key])
                for key in dropped_attrs.keys()
            }
        )
    # Pass 3: deduplicate dropped attributes that vary identically. The first
    # of an equivalent group becomes a scan axis; later equivalent ones are
    # stored in `result` as an alias pointing at that axis name.
    scan_attrs = OrderedDict()
    scan_length = []
    for attrs_key in dropped_attrs.keys():
        flag = True  # stays True while no equivalent axis has been found
        for key in scan_attrs.keys():
            if equivalent(scan_attrs[key], dropped_attrs[attrs_key]):
                flag = False
                result.update({attrs_key: key})
                break
        if flag:
            scan_attrs.update({
                attrs_key: dropped_attrs[attrs_key]
            })
            scan_length = np.append(scan_length, len(dropped_attrs[attrs_key]))
    # Expose each scan axis' value array directly on the merged attrs.
    result.update(
        {
            key: value
            for key, value in scan_attrs.items()
        }
    )
    result.update(
        {
            "scanAxis": list(scan_attrs.keys()),
            "scanAxisLength": scan_length,
        }
    )
    # if result['scanAxis'] == []:
    #     result['scanAxis'] = ['runs',]
    return result
  72. def _read_shot_number_from_hdf5(x):
  73. filePath = x.encoding["source"]
  74. shotNum = filePath.split("_")[-1].split("_")[-1].split(".")[0]
  75. return x.assign(shotNum=shotNum)
  76. def _assign_scan_axis_partial(x, datesetOfGlobal, fullFilePath):
  77. scanAxis = datesetOfGlobal.scanAxis
  78. filePath = x.encoding["source"].replace("\\", "/")
  79. shotNum = np.where(fullFilePath==filePath)
  80. shotNum = np.squeeze(shotNum)
  81. # shotNum = filePath.split("_")[-1].split("_")[-1].split(".")[0]
  82. x = x.assign(shotNum=shotNum)
  83. x = x.expand_dims(list(scanAxis))
  84. return x.assign_coords(
  85. {
  86. key: np.atleast_1d(np.atleast_1d(datesetOfGlobal.attrs[key])[int(shotNum)])
  87. for key in scanAxis
  88. }
  89. )
def _update_globals_attrs(variable_attrs, context=None):
    # Placeholder: intended counterpart to _read_globals_attrs for updating
    # merged global attributes; not yet implemented.
    pass
def update_hdf5_file():
    # Placeholder: intended to write changes back to an HDF5 file; not yet
    # implemented.
    pass
  94. def read_hdf5_file(filePath, group=None, datesetOfGlobal=None, preprocess=None, join="outer", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
  95. filePath = np.sort(np.atleast_1d(filePath))
  96. filePathAbs = []
  97. for i in range(len(filePath)):
  98. filePathAbs.append(os.path.abspath(filePath[i]).replace("\\", "/"))
  99. fullFilePath = []
  100. for i in range(len(filePathAbs)):
  101. fullFilePath.append(list(np.sort(glob.glob(filePathAbs[i]))))
  102. fullFilePath = np.array(fullFilePath).flatten()
  103. for i in range(len(fullFilePath)):
  104. fullFilePath[i] = fullFilePath[i].replace("\\", "/")
  105. if not maxFileNum is None:
  106. fullFilePath = fullFilePath[0:int(maxFileNum)]
  107. kwargs.update(
  108. {
  109. 'join': join,
  110. 'parallel': parallel,
  111. 'engine': engine,
  112. 'phony_dims': phony_dims,
  113. 'group': group
  114. }
  115. )
  116. if datesetOfGlobal is None:
  117. datesetOfGlobal = xr.open_mfdataset(
  118. fullFilePath,
  119. group="globals",
  120. concat_dim="fileNum",
  121. combine="nested",
  122. preprocess=_read_shot_number_from_hdf5,
  123. engine="h5netcdf",
  124. phony_dims="access",
  125. combine_attrs=_read_globals_attrs,
  126. parallel=True, )
  127. datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
  128. _assgin_scan_axis = partial(_assign_scan_axis_partial, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
  129. if preprocess is None:
  130. kwargs.update({'preprocess':_assgin_scan_axis})
  131. else:
  132. kwargs.update({'preprocess':preprocess})
  133. ds = xr.open_mfdataset(fullFilePath, **kwargs)
  134. newDimKey = np.append(['x', 'y', 'z'], [ chr(i) for i in range(97, 97+23)])
  135. oldDimKey = np.sort(
  136. [
  137. key
  138. for key in ds.dims
  139. if not key in datesetOfGlobal.scanAxis
  140. ]
  141. )
  142. renameDict = {
  143. oldDimKey[j]: newDimKey[j]
  144. for j in range(len(oldDimKey))
  145. }
  146. ds = ds.rename_dims(renameDict)
  147. ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)
  148. return ds
  149. def _assign_scan_axis_partial_and_remove_everything(x, datesetOfGlobal, fullFilePath):
  150. scanAxis = datesetOfGlobal.scanAxis
  151. filePath = x.encoding["source"].replace("\\", "/")
  152. shotNum = np.where(fullFilePath==filePath)
  153. shotNum = np.squeeze(shotNum)
  154. runTime = _read_run_time_from_hdf5(x)
  155. x = xr.Dataset(data_vars={'runTine':runTime})
  156. x = x.expand_dims(list(scanAxis))
  157. return x.assign_coords(
  158. {
  159. key: np.atleast_1d(np.atleast_1d(datesetOfGlobal.attrs[key])[int(shotNum)])
  160. for key in scanAxis
  161. }
  162. )
  163. def _read_run_time_from_hdf5(x):
  164. runTime = datetime.strptime(x.attrs['run time'], '%Y%m%dT%H%M%S')
  165. return runTime
  166. def read_hdf5_run_time(filePath, group=None, datesetOfGlobal=None, preprocess=None, join="outer", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
  167. filePath = np.sort(np.atleast_1d(filePath))
  168. filePathAbs = []
  169. for i in range(len(filePath)):
  170. filePathAbs.append(os.path.abspath(filePath[i]).replace("\\", "/"))
  171. fullFilePath = []
  172. for i in range(len(filePathAbs)):
  173. fullFilePath.append(list(np.sort(glob.glob(filePathAbs[i]))))
  174. fullFilePath = np.array(fullFilePath).flatten()
  175. for i in range(len(fullFilePath)):
  176. fullFilePath[i] = fullFilePath[i].replace("\\", "/")
  177. if not maxFileNum is None:
  178. fullFilePath = fullFilePath[0:int(maxFileNum)]
  179. kwargs.update(
  180. {
  181. 'join': join,
  182. 'parallel': parallel,
  183. 'engine': engine,
  184. 'phony_dims': phony_dims,
  185. 'group': group
  186. }
  187. )
  188. if datesetOfGlobal is None:
  189. datesetOfGlobal = xr.open_mfdataset(
  190. fullFilePath,
  191. group="globals",
  192. concat_dim="fileNum",
  193. combine="nested",
  194. preprocess=_read_shot_number_from_hdf5,
  195. engine="h5netcdf",
  196. phony_dims="access",
  197. combine_attrs=_read_globals_attrs,
  198. parallel=True, )
  199. datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
  200. _assgin_scan_axis = partial(_assign_scan_axis_partial_and_remove_everything, datesetOfGlobal=datesetOfGlobal, fullFilePath=fullFilePath)
  201. if preprocess is None:
  202. kwargs.update({'preprocess':_assgin_scan_axis})
  203. else:
  204. kwargs.update({'preprocess':preprocess})
  205. ds = xr.open_mfdataset(fullFilePath, **kwargs)
  206. newDimKey = np.append(['x', 'y', 'z'], [ chr(i) for i in range(97, 97+23)])
  207. oldDimKey = np.sort(
  208. [
  209. key
  210. for key in ds.dims
  211. if not key in datesetOfGlobal.scanAxis
  212. ]
  213. )
  214. renameDict = {
  215. oldDimKey[j]: newDimKey[j]
  216. for j in range(len(oldDimKey))
  217. }
  218. ds = ds.rename_dims(renameDict)
  219. ds.attrs = copy.deepcopy(datesetOfGlobal.attrs)
  220. return ds
  221. def read_hdf5_global(filePath, preprocess=None, join="outer", combine="nested", parallel=True, engine="h5netcdf", phony_dims="access", excludeAxis=[], maxFileNum=None, **kwargs):
  222. filePath = np.sort(np.atleast_1d(filePath))
  223. filePathAbs = []
  224. for i in range(len(filePath)):
  225. filePathAbs.append(os.path.abspath(filePath[i]).replace("\\", "/"))
  226. fullFilePath = []
  227. for i in range(len(filePathAbs)):
  228. fullFilePath.append(list(np.sort(glob.glob(filePathAbs[i]))))
  229. fullFilePath = np.array(fullFilePath).flatten()
  230. for i in range(len(fullFilePath)):
  231. fullFilePath[i] = fullFilePath[i].replace("\\", "/")
  232. if not maxFileNum is None:
  233. fullFilePath = fullFilePath[0:int(maxFileNum)]
  234. kwargs.update(
  235. {
  236. 'join': join,
  237. 'parallel': parallel,
  238. 'engine': engine,
  239. 'phony_dims': phony_dims,
  240. 'group': "globals",
  241. 'preprocess': _read_shot_number_from_hdf5,
  242. 'combine_attrs': _read_globals_attrs,
  243. 'combine':combine,
  244. 'concat_dim': "fileNum",
  245. }
  246. )
  247. datesetOfGlobal = xr.open_mfdataset(fullFilePath, **kwargs)
  248. datesetOfGlobal.attrs['scanAxis'] = np.setdiff1d(datesetOfGlobal.attrs['scanAxis'], excludeAxis)
  249. return datesetOfGlobal