Coverage for src / mafw / processor_library / importer.py: 100%
125 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-09 09:08 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-09 09:08 +0000
1# Copyright 2025 European Union
2# Author: Bulgheroni Antonio (antonio.bulgheroni@ec.europa.eu)
3# SPDX-License-Identifier: EUPL-1.2
4"""
5Provides a basic element importer.
7The first step in the setting up of the analytical framework of a data analysis procedure is to add new elements to
8the input set.
9These elements can encompass a wide range of data, including results from experiments or simulations, as well as information
10gathered through web scraping or other data sources.
12Independently of where the data are coming from, one common task is to add those data to your collection inside the
13DB, so that the following analytical steps know where the data are and what they are.
15This module provides a generic processor that the user can subclass and customize to their needs to import
16input files. Thanks to a smart filename parsing, other information can be extracted from the filename itself and
17used to populate additional columns in the dedicated database table.
19"""
21from __future__ import annotations
23import logging
24import re
25import tomllib
26from pathlib import Path
27from typing import TYPE_CHECKING, Any
29from mafw.mafw_errors import MissingAttribute, ParserConfigurationError, ParsingError
30from mafw.processor import ActiveParameter, Processor
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class FilenameElement:
    """
    Helper class for the definition of a filename element.

    While importing an element to the DB, several parameters can be retrieved directly from the filename. The role of
    this helper class is to provide an easy way to define patterns in the filename representing a specific piece of
    information that has to be transferred to the DB.

    The element is characterized by a name, a regular expression, the expected python type for the parsed value and
    an optional default value. The regular expression must contain a named group in the form ``?P<name>`` where name
    is matching the FilenameElement name.

    To make a filename element optional, it is enough to provide a default value different from None.
    In this case, if the parsing is failing, then the default value will be returned.
    """

    type_lut: dict[str, type[str] | type[int] | type[float]] = {'str': str, 'int': int, 'float': float}
    """A lookup table for converting type definition as string into python types"""

    def __init__(
        self,
        name: str,
        regex: str | re.Pattern[str],
        value_type: type = str,
        default_value: str | int | float | None = None,
    ) -> None:
        """
        Constructor parameters:

        :param name: The name of the filename element
        :type name: str
        :param regex: The regular expression associated to this filename element. It must contain a named group in the
            form ?P<name>.
        :type regex: str | re.Pattern[str]
        :param value_type: The type the output value should be converted into. It defaults to str.
        :type value_type: type, Optional
        :param default_value: The default value to assign to the filename element if the pattern is not found in the
            filename. It defaults to None
        :type default_value: Any, Optional
        :raise TypeError: if the default value type does not match the declared value type.
        :raise ValueError: if the regular expression has no named group called ``name``.
        """
        self._variable_name: str = name
        if not isinstance(regex, re.Pattern):
            regex = re.compile(regex)

        self._regex: re.Pattern[str] = regex
        self._value_type = value_type
        self._default_value = default_value
        self._value = default_value
        # the default type is validated before the regular expression, so a wrong default is reported first.
        self._validate_default_type()
        self._validate_regexp()

    def _validate_regexp(self) -> None:
        """
        Checks if the regular expression contains a named group named after the element itself.

        The check is performed on the groups actually compiled in the pattern, so that text merely looking like a
        named group (for example inside a character set) is not accepted.

        :raise ValueError: if the regular expression is not valid.
        """
        if self._variable_name not in self._regex.groupindex:
            raise ValueError('Attempt to create a FilenameElement with a regular expression without a named group.')

    def _validate_default_type(self) -> None:
        """
        Checks that the default has a type matching the value type. The check is actually performed if and only if a
        default value is provided. If None, then the validation is skipped.

        :raise TypeError: if the default value type does not match the declared value type.
        """
        if self._default_value is None:
            return
        # exact type comparison on purpose: subclasses (e.g. bool for int) are not accepted.
        if type(self._default_value) is not self._value_type:
            raise TypeError(
                f'The type of the default value ({str(type(self._default_value))}) is not matching the '
                f'declared value type ({str(self._value_type)})'
            )

    @classmethod
    def _get_value_type(cls, type_as_string: str) -> type:
        """
        Returns the value type.

        This method is used by the class method constructor to check if the user provided type in the form of a
        string is a valid one.

        If so, then the corresponding python type is returned, otherwise a ValueError exception is raised.

        :param type_as_string: The type of the value as a string.
        :type type_as_string: str
        :return: The corresponding python type.
        :rtype: type
        :raise ValueError: if type_as_string is not any of the acceptable type for the value.
        """
        if type_as_string not in cls.type_lut:
            raise ValueError('Attempt to create a FilenameElement with a not available value type')

        return cls.type_lut[type_as_string]

    def reset(self) -> None:
        """
        Resets the value to the default value.

        **Remember:** that the default value is None for compulsory elements.
        """
        self._value = self._default_value

    @classmethod
    def from_dict(cls, name: str, info_dict: dict[str, str | int | float]) -> FilenameElement:
        """
        Generates a FilenameElement starting from external information stored in a dictionary.

        `info_dict` should contain the following three keys:
            - regexp: the Regular expression for the element search.
            - type (*optional*): a string with the python type name (int, float, str) for the element conversion.
              If missing, ``str`` is used.
            - default (*optional*): a default value.

        :param name: The name of the element.
        :type name: str
        :param info_dict: The dictionary with the required parameters for the class constructor.
        :type info_dict: dict
        :return: An instance of FilenameElement.
        :rtype: FilenameElement
        :raise KeyError: if ``info_dict`` has no ``regexp`` key.
        :raise TypeError: if the regular expression is not a string.
        :raise ValueError: if the type is not a string or it is not one of the available types.
        """
        # get the regexp
        try:
            regexp = info_dict['regexp']
        except KeyError:
            log.critical('Attempt to create a FilenameElement without a regular expression')
            raise
        # now let's check that the type of the regexp is acceptable
        if not isinstance(regexp, str):
            raise TypeError('Problem with regexp')

        value_type_str = info_dict.get('type', 'str')
        # check that this is a string
        if not isinstance(value_type_str, str):
            raise ValueError('Attempt to create a FilenameElement with a wrong value type.')

        value_type = cls._get_value_type(value_type_str)

        return cls(name, regex=regexp, value_type=value_type, default_value=info_dict.get('default'))

    @property
    def name(self) -> str:
        """Returns the element name"""
        return self._variable_name

    @property
    def value(self) -> str | int | float | None:
        """Returns the element value"""
        return self._value

    @property
    def is_optional(self) -> bool:
        """Returns if the element is optional, that is if it has a default value different from None"""
        return self._default_value is not None

    @property
    def is_found(self) -> bool:
        """
        Returns if the file element is found.

        An optional element is always considered found, because its default value is used as a fallback.
        """
        # for a compulsory element the default is None, so a value means that the search was successful.
        return self.is_optional or self._value is not None

    @property
    def pattern(self) -> str | bytes:
        """Returns the regular expression pattern"""
        return self._regex.pattern

    def search(self, string: str | Path) -> None:
        """
        Searches the string for the regular expression.

        If the pattern is found in the string, then the matched value is transferred to the FilenameElement value.

        .. note::

            This method is not returning the match value. It is only searching the input string for the
            registered pattern. If the pattern is found, then the user can retrieve the matched value by invoking the
            :meth:`.value` method. If the pattern is not found, the :meth:`.value` will return either None,
            for a compulsory element, or the default value for an optional one.

        If the regular expression matches but the named group did not take part in the match (for example because
        the group itself is optional in the pattern), the element is treated as not found.

        :param string: The string to be parsed. In most of the case, this is a filename, that is why the method is
            accepting also a Path type.
        :type string: str | Path
        """
        self.reset()
        if isinstance(string, Path):
            string = str(string)

        match = self._regex.search(string)
        if match is None:
            return

        raw_value = match[self._variable_name]
        if raw_value is None:
            # the group exists in the pattern but captured nothing: converting None would either store the
            # string 'None' or raise a TypeError, so the default value is kept instead.
            return

        self._value = self._value_type(raw_value)
228class FilenameParser:
229 r"""
230 Helper class to interpret all elements in a filename.
232 Inside a filename, there might be many elements containing information about the item that must be stored in the DB.
233 This class will parse the filename, and after a successful identification of them all, it will make them available
234 for the importer class to fill in the fields in the database.
236 The :class:`~FilenameParser` needs to be configured to be able to recognise each element in the filename.
237 Such configuration is saved in a `toml` file.
238 An example of such a configuration is provided :download:`here </_static/toml_files/filename_parser_conf.toml>`.
240 Each element must start with its name and a valid regular expression and a python type (in string).
241 If an element is optional, then a default value must be provided as well.
243 After the configuration, the filename can be interpreted invoking the :meth:`~interpret` method.
244 This will perform the actual parsing of the filename.
245 If an error occurs during the parsing process, meaning that a compulsory element is not found, then the
246 :class:`~.ParsingError` exception will be raised.
247 So remember to protect the interpretation with a try/except block.
249 The value of each file element is available upon request.
250 The user has simply to invoke the :meth:`~get_element_value` providing the element name.
251 """
253 def __init__(self, configuration_file: str | Path, filename: str | Path | None = None) -> None:
254 """
255 Constructor parameters:
257 :param filename: The filename to be interpreted.
258 :type filename: str | Path
259 :param configuration_file: The configuration file for the interpreter.
260 :type configuration_file: str | Path
261 :raise ParserConfigurationError: If the configuration file is invalid.
262 """
264 #: The filename for this interpreter. If None, it should be specified before interpretation.
265 self._filename = str(filename) if filename is not None else None
266 #: The configuration file for the interpreter.
267 self._configuration_file = configuration_file
268 #: A dictionary with all the FilenameElement
269 self._element_dict: dict[str, FilenameElement] = {}
271 self._parser_configuration()
273 def _parser_configuration(self) -> None:
274 """
275 Loads the parser configuration, generates the required FilenameElement and adds them element dictionary.
277 The configuration file is stored in a TOML file.
279 This private method is automatically invoked by the class constructor.
281 :raise ParserConfigurationError: if the provided configuration file is invalid.
282 """
283 with open(self._configuration_file, 'rb') as f:
284 config = tomllib.load(f)
286 for element in config['elements']:
287 if element not in config:
288 raise ParserConfigurationError(f'Missing {element} table.')
290 self._element_dict[element] = FilenameElement.from_dict(element, config[element])
292 @property
293 def elements(self) -> dict[str, FilenameElement]:
294 """Returns the filename element dictionary"""
295 return self._element_dict
297 def interpret(self, filename: str | Path | None = None) -> None:
298 """
299 Performs the interpretation of the filename.
301 The filename can be provided either as constructor argument or here as an argument. If both, then the local
302 one will have the precedence.
304 :raises ParsingError: if a compulsory element is not found in the filename
305 :raises MissingAttribute: if no filename has been specified.
306 """
307 if self._filename is None and filename is None:
308 raise MissingAttribute('Missing filename')
310 if filename:
311 self.reset()
312 self._filename = str(filename)
314 if TYPE_CHECKING:
315 # self._filename is either set by the constructor or by the interpret method
316 # at this stage it cannot be None
317 assert self._filename is not None
319 for element in self._element_dict.values():
320 element.search(self._filename)
321 if not element.is_found:
322 raise ParsingError(f'Missing {element.name}')
324 def get_element(self, element_name: str) -> FilenameElement | None:
325 """Gets the FilenameElement named element_name"""
326 if element_name in self._element_dict:
327 return self._element_dict[element_name]
328 else:
329 return None
331 def get_element_value(self, element_name: str) -> str | int | float | None:
332 """
333 Gets the value of the FilenameElement named element_name.
335 It is equivalent to call ``self.get_element('element_name').value``
336 """
337 if element_name in self._element_dict:
338 return self._element_dict[element_name].value
339 else:
340 return None
342 def reset(self) -> None:
343 """Resets all filename elements"""
344 for element in self._element_dict.values():
345 element.reset()
class Importer(Processor):
    """
    Base processor for importing elements in the Database structure.

    It is meant as a skeleton to be subclassed by a more specific importer related to a certain project.

    Three processor parameters are available for its customisation:

    * ``parser_configuration``: the path to the configuration file for the :class:`~.FilenameParser`.
    * ``input_folder``: the path where the input files to be imported are.
    * ``recursive``: a flag to specify if all subfolders should be also scanned.

    For a concrete implementation, have a look at the :class:`~.ImporterExample` from the example library.
    """

    parser_configuration = ActiveParameter(
        'parser_configuration',
        default='parser_configuration.toml',
        help_doc='The path to the TOML file with the filename parser configuration ',
    )
    input_folder = ActiveParameter(
        'input_folder',
        default=str(Path.cwd()),
        help_doc='The input folder from where the images have to be imported.',
    )
    recursive = ActiveParameter(
        'recursive',
        default=True,
        help_doc='Extend the search to sub-folder',
    )

    def __init__(self, *args: Any, **kwargs: Any):
        """
        Constructor parameters:

        All positional and keyword arguments are forwarded unchanged to the parent class constructor.
        """
        super().__init__(*args, **kwargs)
        # only declared here: the actual instance is assigned in the start method.
        self._filename_parser: FilenameParser
        """The filename parser instance"""

    def format_progress_message(self) -> None:
        """Sets the progress message using ``i_item + 1`` and ``n_item``."""
        self.progress_message = f'[cyan]Importing element {self.i_item + 1} of {self.n_item}'

    def start(self) -> None:
        """
        The start method.

        After the parent class start, the filename parser is created using the provided configuration file.

        :raise ParserConfigurationError: If the configuration file is not valid.
        """
        super().start()
        if TYPE_CHECKING:
            assert isinstance(self.parser_configuration, str)

        self._filename_parser = FilenameParser(self.parser_configuration)