Major update to files.py:

Tom-Notch · Tom-Notch · commit b0232dbf16f9 · 2024-11-10T22:25:07.000-05:00
- support JSON, TOML, and CSV files
- decouple YAML support from cv2 by using a custom YAML constructor
- better file-type detection with python-magic (for image file types)
- file cache to prevent infinite recursive reads of the same file
diff --git a/README.md b/README.md
@@ -2,8 +2,6 @@
 
 This repository is a ROS library-only package that provides file utilities.
 
-Pre-built docker image contains all the dependencies and can be used as dev/deploy image, it is multi-arch and can run on both x86 and ARM devices.
-
 ## Dependencies
 
 System environment needs to have the following environment:
diff --git a/src/file_utils/files.py b/src/file_utils/files.py
@@ -7,38 +7,51 @@
 #
 # Copyright Ⓒ 2023 Mukai (Tom Notch) Yu, Yao He
 #
+import csv
+import json
 import os
+import warnings
 
 import cv2
+import magic
 import numpy as np
+import toml
+import yaml
 
 # import torch
 # import torch_tensorrt
 
 
-# loaded_models = []  # list of loaded models for one total program run
+mime = magic.Magic(mime=True, uncompress=True)
 
+file_cache = {}  # cache for files that are read
 
-# def load_model(model_path: str) -> torch.nn.Module:
-#     """ensure that only one instance of the model is loaded, later loadings will point to the same model instance loaded before
 
-#     Args:
-#         model_path (str): path to the model
+def opencv_matrix_constructor(loader, node):
+    """Custom constructor for !!opencv-matrix tag."""
 
-#     Returns:
-#         torch.nn.Module: model
-#     """
-#     model = torch.jit.load(model_path)
+    # Parse the node as a dictionary
+    matrix_data = loader.construct_mapping(node, deep=True)
 
-#     for loaded_model in loaded_models:
-#         if (
-#             model.state_dict() == loaded_model.state_dict()
-#         ):  # if the model is already loaded
-#             del model  # delete the duplicate model
-#             return loaded_model  # return the loaded model
+    # Extract rows, cols, dt, and data
+    rows = matrix_data["rows"]
+    cols = matrix_data["cols"]
+    dt = matrix_data["dt"]
+    data = matrix_data["data"]
 
-#     loaded_models.append(model)  # model's a new model, add it to the list
-#     return model
+    # Map OpenCV data types to NumPy data types
+    dtype_map = {"u": np.uint8, "i": np.int32, "f": np.float32, "d": np.float64}
+
+    # Determine the NumPy data type
+    dtype = dtype_map.get(dt, np.float64)
+
+    # Convert data to a NumPy array and reshape
+    matrix = np.array(data, dtype=dtype).reshape((rows, cols))
+
+    return matrix
+
+
+yaml.add_constructor("tag:yaml.org,2002:opencv-matrix", opencv_matrix_constructor)
 
 
 def print_dict(d: dict, indent: int = 0) -> None:
@@ -77,53 +90,41 @@ def parse_path(probe_path: str, base_path: str = None):
     if base_path is None:
         base_path = os.getcwd()
     if os.path.isabs(expand_path) and os.path.exists(expand_path):
-        return expand_path
+        return os.path.realpath(expand_path)
     elif os.path.exists(os.path.join(base_path, probe_path)):
-        return os.path.join(base_path, probe_path)
+        return os.path.realpath(os.path.join(base_path, probe_path))
     else:
         return False
 
 
-def get_item(node: cv2.FileNode, yaml_base_path: str):
-    """get an item from a cv2.FileNode, recursively parse the item if it is a Map, List or path
+def parse_content(node, yaml_base_path: str):
+    """recursively look into the leaf node of a yaml file, if the leaf node is a string, try to parse it as a path and read the file, could be an image or nested yaml config
 
     Args:
-        node (cv2.FileNode): file node to be parsed
+        node: file node to be parsed
         yaml_base_path (str): the base path of the current yaml file
 
     Returns:
         the read content
     """
-    if node.isNone():  # empty
-        return None
-    elif node.isMap():  # dict
-        keys = node.keys()
-        if all(mat_key in keys for mat_key in ["rows", "cols", "dt", "data"]):  # matrix
-            return node.mat()
-        else:  # key-value pairs
-            dict = {}
-            for key in keys:
-                dict[key] = get_item(node.getNode(key), yaml_base_path)
-            return dict
-    elif node.isSeq():  # list
-        list = []
-        for i in range(node.size()):
-            list.append(get_item(node.at(i), yaml_base_path))
-        return list
-    elif node.isReal() or node.isInt():  # number
-        return node.real()
-    elif node.isString():  # string
-        path = parse_path(
-            node.string(), yaml_base_path
-        )  # try parsing the string as path
+
+    if isinstance(node, dict):
+        for key, value in node.items():
+            node[key] = parse_content(value, yaml_base_path)
+    elif isinstance(node, list):
+        for index, value in enumerate(node):
+            node[index] = parse_content(value, yaml_base_path)
+    elif isinstance(node, str):
+        path = parse_path(node, yaml_base_path)
         if path:
-            return read_file(path)
-        else:  # not a path
-            return node.string()
+            node = read_file(path)
+
+    return node
 
 
 def read_file(path: str):
-    """test the path, read a file, if it is a yaml file; parse it, if it is an image, read it; if it is a torchscript module, return the path; otherwise raise exception for file not supported.
+    """test the path, read a file.
+       supports multiple file type and interleaving config file types.
 
     Args:
         path (str): path to the file, can be absolute or relative
@@ -133,12 +134,44 @@ def read_file(path: str):
     """
     if not os.path.exists(path):
         raise FileNotFoundError(f"File {path} does not exist")
-    if path.endswith(".yaml") or path.endswith(".yml"):
-        yaml_file = cv2.FileStorage(path, cv2.FILE_STORAGE_READ)
-        return get_item(yaml_file.root(), os.path.dirname(path))
-    elif path.endswith(".png") or path.endswith(".jpg") or path.endswith(".jpeg"):
-        return cv2.imread(path)
-    # elif path.endswith(".ts") or path.endswith(".trt"):
-    #     return load_model(path)
+
+    if path in file_cache.keys():
+        return file_cache[path]
+
+    # determine file type with python-magic, useful for massive image types
+    mime_type = mime.from_file(path)
+
+    file_base_path = os.path.dirname(path)
+
+    if mime_type in {
+        "application/yaml",
+        "application/x-yaml",
+        "text/yaml",
+    } or path.endswith((".yaml", ".yml")):
+        with open(path, "r") as f:
+            parsed_yaml = yaml.load(f.read(), Loader=yaml.FullLoader)
+        parsed_content = parse_content(parsed_yaml, file_base_path)
+    elif mime_type.startswith("image/"):
+        parsed_content = cv2.imread(path)
+    elif mime_type in {"text/csv", "application/csv"} or path.endswith(".csv"):
+        with open(path, "r") as f:
+            reader = csv.reader(f)
+            parsed_content = list(reader)
+    elif mime_type in {"application/json", "text/json"} or path.endswith(".json"):
+        with open(path, "r") as f:
+            parsed_content = parse_content(json.load(f), file_base_path)
+    elif mime_type in {"application/toml", "text/toml"} or path.endswith(".toml"):
+        parsed_content = parse_content(toml.load(path), file_base_path)
+    elif path.endswith(".npy"):
+        try:
+            parsed_content = np.load(path, allow_pickle=True)
+        except Exception as e:
+            raise RuntimeError(f"Error loading NumPy array from {path}: {e}")
+    # elif mime_type in {'application/x-torchscript', 'application/x-tensorrt'} or path.endswith((".ts", ".trt")):
+    #     parsed_content = torch.jit.load(model_path)
     else:
-        raise Exception(f"File {path} is not supported")
+        warnings.warn(f"File {path} is not supported, reading as string")
+        parsed_content = path
+
+    file_cache[path] = parsed_content
+    return parsed_content