Coverage for src / mafw / processor_library / importer.py: 100%

125 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-09 09:08 +0000

1# Copyright 2025 European Union 

2# Author: Bulgheroni Antonio (antonio.bulgheroni@ec.europa.eu) 

3# SPDX-License-Identifier: EUPL-1.2 

4""" 

5Provides a basic element importer. 

6 

7The first step in the setting up of the analytical framework of a data analysis procedure is to add new elements to 

8the input set. 

9These elements can encompass a wide range of data, including results from experiments or simulations, as well as information 

10gathered through web scraping or other data sources. 

11 

12Independently of where the data are coming from, one common task is to add those data to your collection inside the 

13DB, so that the following analytical steps know where the data are and what they are. 

14 

15This module provides a generic processor that the user can subclass and customize to their needs to import 

16input files. Thanks to a smart filename parsing, other information can be extracted from the filename itself and 

17used to populate additional columns in the dedicated database table. 

18 

19""" 

20 

21from __future__ import annotations 

22 

23import logging 

24import re 

25import tomllib 

26from pathlib import Path 

27from typing import TYPE_CHECKING, Any 

28 

29from mafw.mafw_errors import MissingAttribute, ParserConfigurationError, ParsingError 

30from mafw.processor import ActiveParameter, Processor 

31 

32log = logging.getLogger(__name__) 

33 

34 

class FilenameElement:
    """
    Helper class for the definition of a filename element.

    While importing an element to the DB, several parameters can be retrieved directly from the filename. The role of
    this helper class is to provide an easy way to define patterns in the filename representing a specific piece of
    information that has to be transferred to the DB.

    The element is characterized by a name, a regular expression, the expected python type for the parsed value and
    an optional default value. The regular expression must contain a named group in the form ``(?P<name>...)`` where
    name is matching the FilenameElement name.

    To make a filename element optional, it is enough to provide a default value different from None.
    In this case, if the parsing is failing, then the default value will be returned.
    """

    type_lut: dict[str, type[str] | type[int] | type[float]] = {'str': str, 'int': int, 'float': float}
    """A lookup table for converting type definition as string into python types"""

    def __init__(
        self,
        name: str,
        regex: str | re.Pattern[str],
        value_type: type = str,
        default_value: str | int | float | None = None,
    ) -> None:
        """
        Constructor parameters:

        :param name: The name of the filename element
        :type name: str
        :param regex: The regular expression associated to this filename element. It must contain a named group in the
            form ``(?P<name>...)``. A string is compiled on the fly, an already compiled pattern is used as it is.
        :type regex: str | re.Pattern[str]
        :param value_type: The type the output value should be converted into. It defaults to str.
        :type value_type: type, Optional
        :param default_value: The default value to assign to the filename element if the pattern is not found in the
            filename. It defaults to None
        :type default_value: Any, Optional
        :raise TypeError: if the default value type does not match the declared value type.
        :raise ValueError: if the regular expression does not define a group named after the element.
        """
        self._variable_name: str = name
        if not isinstance(regex, re.Pattern):
            regex = re.compile(regex)

        self._regex: re.Pattern[str] = regex
        self._value_type = value_type
        self._default_value = default_value
        self._value = default_value
        self._validate_default_type()
        self._validate_regexp()

    def _validate_regexp(self) -> None:
        """
        Checks if the regular expression contains a named group named after the element itself.

        The check is done on the groups of the compiled pattern and not on the pattern text, so that an escaped
        sequence that only looks like a named group (for example ``\\(?P<name>``) is correctly rejected here instead
        of failing later on during the search.

        :raise ValueError: if the regular expression is not valid.
        """
        if self._variable_name not in self._regex.groupindex:
            raise ValueError('Attempt to create a FilenameElement with a regular expression without a named group.')

    def _validate_default_type(self) -> None:
        """
        Checks that the default has a type matching the value type. The check is actually performed if and only if a
        default value is provided. If None, then the validation is skipped.

        The comparison is strict: the type of the default value must be exactly the declared value type.

        :raise TypeError: if the default value type does not match the declared value type.
        """
        if self._default_value is None:
            return

        if type(self._default_value) is not self._value_type:
            raise TypeError(
                f'The type of the default value ({str(type(self._default_value))}) is not matching the '
                f'declared value type ({str(self._value_type)})'
            )

    @classmethod
    def _get_value_type(cls, type_as_string: str) -> type:
        """
        Returns the value type.

        This method is used by the class method constructor to check if the user provided type in the form of a
        string is a valid one.

        If so, then the corresponding python type is returned, otherwise a ValueError exception is raised.

        :param type_as_string: The type of the value as a string.
        :type type_as_string: str
        :return: The corresponding python type.
        :rtype: type
        :raise ValueError: if type_as_string is not any of the acceptable type for the value.
        """
        if type_as_string not in cls.type_lut:
            raise ValueError('Attempt to create a FilenameElement with a not available value type')

        return cls.type_lut[type_as_string]

    def reset(self) -> None:
        """
        Resets the value to the default value.

        **Remember:** that the default value is None for compulsory elements.
        """
        self._value = self._default_value

    @classmethod
    def from_dict(cls, name: str, info_dict: dict[str, str | int | float]) -> FilenameElement:
        """
        Generates a FilenameElement starting from external information stored in a dictionary.

        `info_dict` may contain the following three keys:
            - regexp: the regular expression for the element search (compulsory).
            - type (*optional*): a string with the python type name (int, float, str) for the element conversion.
              If missing, str is assumed.
            - default (*optional*): a default value.

        :param name: The name of the element.
        :type name: str
        :param info_dict: The dictionary with the required parameters for the class constructor.
        :type info_dict: dict
        :return: An instance of FilenameElement.
        :rtype: FilenameElement
        :raise KeyError: if the regexp key is missing.
        :raise TypeError: if the regexp is not a string.
        :raise ValueError: if the type is not a string or not one of the accepted type names.
        """
        # the regular expression is the only compulsory key
        try:
            regexp = info_dict['regexp']
        except KeyError:
            log.critical('Attempt to create a FilenameElement without a regular expression')
            raise
        # only plain strings are accepted when the configuration comes from a dictionary
        if not isinstance(regexp, str):
            raise TypeError('Problem with regexp')

        value_type_str = info_dict.get('type', 'str')
        # the type must be given by its name, not as a python type
        if not isinstance(value_type_str, str):
            raise ValueError('Attempt to create a FilenameElement with a wrong value type.')

        value_type = cls._get_value_type(value_type_str)

        return cls(name, regex=regexp, value_type=value_type, default_value=info_dict.get('default', None))

    @property
    def name(self) -> str:
        """Returns the name of the filename element"""
        return self._variable_name

    @property
    def value(self) -> str | int | float | None:
        """Returns the current value of the filename element"""
        return self._value

    @property
    def is_optional(self) -> bool:
        """Returns if the element is optional, that is to say if it has a default value"""
        return self._default_value is not None

    @property
    def is_found(self) -> bool:
        """
        Returns if the file element is found.

        An optional element is always considered as found, because it can fall back on its default value.
        """
        # for a compulsory element the default is None, so any other value means that the search was successful
        return self.is_optional or self._value != self._default_value

    @property
    def pattern(self) -> str | bytes:
        """Returns the regular expression pattern"""
        return self._regex.pattern

    def search(self, string: str | Path) -> None:
        """
        Searches the string for the regular expression.

        If the pattern is found in the string, then the matched value is converted to the declared value type and
        transferred to the FilenameElement value.

        .. note::

            This method is not returning the match value. It is only searching the input string for the
            registered pattern. If the pattern is found, then the user can retrieve the matched value via the
            :attr:`.value` property. If the pattern is not found, :attr:`.value` will return either None,
            for a compulsory element, or the default value for an optional one.

            The same applies when the overall pattern matches, but the named group is inside an optional part of
            the expression that did not take part in the match.

        :param string: The string to be parsed. In most of the case, this is a filename, that is why the method is
            accepting also a Path type.
        :type string: str | Path
        """
        self.reset()
        if isinstance(string, Path):
            string = str(string)

        result = self._regex.search(string)
        if result is None:
            return

        matched_text = result[self._variable_name]
        if matched_text is None:
            # the named group did not participate in the match: converting None would either raise or
            # produce the literal string 'None', so the element is left at its default value.
            return

        self._value = self._value_type(matched_text)

226 

227 

228class FilenameParser: 

229 r""" 

230 Helper class to interpret all elements in a filename. 

231 

232 Inside a filename, there might be many elements containing information about the item that must be stored in the DB. 

233 This class will parse the filename, and after a successful identification of them all, it will make them available 

234 for the importer class to fill in the fields in the database. 

235 

236 The :class:`~FilenameParser` needs to be configured to be able to recognise each element in the filename. 

237 Such configuration is saved in a `toml` file. 

238 An example of such a configuration is provided :download:`here </_static/toml_files/filename_parser_conf.toml>`. 

239 

240 Each element must start with its name and a valid regular expression and a python type (in string). 

241 If an element is optional, then a default value must be provided as well. 

242 

243 After the configuration, the filename can be interpreted invoking the :meth:`~interpret` method. 

244 This will perform the actual parsing of the filename. 

245 If an error occurs during the parsing process, meaning that a compulsory element is not found, then the 

246 :class:`~.ParsingError` exception will be raised. 

247 So remember to protect the interpretation with a try/except block. 

248 

249 The value of each file element is available upon request. 

250 The user has simply to invoke the :meth:`~get_element_value` providing the element name. 

251 """ 

252 

253 def __init__(self, configuration_file: str | Path, filename: str | Path | None = None) -> None: 

254 """ 

255 Constructor parameters: 

256 

257 :param filename: The filename to be interpreted. 

258 :type filename: str | Path 

259 :param configuration_file: The configuration file for the interpreter. 

260 :type configuration_file: str | Path 

261 :raise ParserConfigurationError: If the configuration file is invalid. 

262 """ 

263 

264 #: The filename for this interpreter. If None, it should be specified before interpretation. 

265 self._filename = str(filename) if filename is not None else None 

266 #: The configuration file for the interpreter. 

267 self._configuration_file = configuration_file 

268 #: A dictionary with all the FilenameElement 

269 self._element_dict: dict[str, FilenameElement] = {} 

270 

271 self._parser_configuration() 

272 

273 def _parser_configuration(self) -> None: 

274 """ 

275 Loads the parser configuration, generates the required FilenameElement and adds them element dictionary. 

276 

277 The configuration file is stored in a TOML file. 

278 

279 This private method is automatically invoked by the class constructor. 

280 

281 :raise ParserConfigurationError: if the provided configuration file is invalid. 

282 """ 

283 with open(self._configuration_file, 'rb') as f: 

284 config = tomllib.load(f) 

285 

286 for element in config['elements']: 

287 if element not in config: 

288 raise ParserConfigurationError(f'Missing {element} table.') 

289 

290 self._element_dict[element] = FilenameElement.from_dict(element, config[element]) 

291 

292 @property 

293 def elements(self) -> dict[str, FilenameElement]: 

294 """Returns the filename element dictionary""" 

295 return self._element_dict 

296 

297 def interpret(self, filename: str | Path | None = None) -> None: 

298 """ 

299 Performs the interpretation of the filename. 

300 

301 The filename can be provided either as constructor argument or here as an argument. If both, then the local 

302 one will have the precedence. 

303 

304 :raises ParsingError: if a compulsory element is not found in the filename 

305 :raises MissingAttribute: if no filename has been specified. 

306 """ 

307 if self._filename is None and filename is None: 

308 raise MissingAttribute('Missing filename') 

309 

310 if filename: 

311 self.reset() 

312 self._filename = str(filename) 

313 

314 if TYPE_CHECKING: 

315 # self._filename is either set by the constructor or by the interpret method 

316 # at this stage it cannot be None 

317 assert self._filename is not None 

318 

319 for element in self._element_dict.values(): 

320 element.search(self._filename) 

321 if not element.is_found: 

322 raise ParsingError(f'Missing {element.name}') 

323 

324 def get_element(self, element_name: str) -> FilenameElement | None: 

325 """Gets the FilenameElement named element_name""" 

326 if element_name in self._element_dict: 

327 return self._element_dict[element_name] 

328 else: 

329 return None 

330 

331 def get_element_value(self, element_name: str) -> str | int | float | None: 

332 """ 

333 Gets the value of the FilenameElement named element_name. 

334 

335 It is equivalent to call ``self.get_element('element_name').value`` 

336 """ 

337 if element_name in self._element_dict: 

338 return self._element_dict[element_name].value 

339 else: 

340 return None 

341 

342 def reset(self) -> None: 

343 """Resets all filename elements""" 

344 for element in self._element_dict.values(): 

345 element.reset() 

346 

347 

class Importer(Processor):
    """
    Importer is the base class for importing elements in the Database structure.

    It is meant as a skeleton to be subclassed by a more specific importer related to a certain project.

    Three processor parameters are available for its customisation:

    * ``parser_configuration``: the path to the configuration file for the :class:`~.FilenameParser`.
    * ``input_folder``: the path where the input files to be imported are.
    * ``recursive``: a flag to specify if all subfolders should be also scanned.

    For a concrete implementation, have a look at the :class:`~.ImporterExample` from the example library.
    """

    parser_configuration = ActiveParameter(
        'parser_configuration',
        default='parser_configuration.toml',
        help_doc='The path to the TOML file with the filename parser configuration ',
    )
    input_folder = ActiveParameter(
        'input_folder',
        default=str(Path.cwd()),
        help_doc='The input folder from where the images have to be imported.',
    )
    recursive = ActiveParameter(
        'recursive',
        default=True,
        help_doc='Extend the search to sub-folder',
    )

    def __init__(self, *args: Any, **kwargs: Any):
        """
        Constructor parameters:

        All positional and keyword arguments are forwarded unchanged to the parent class constructor.
        """
        super().__init__(*args, **kwargs)
        #: The filename parser instance. It is only declared here, the instance is created in :meth:`start`.
        self._filename_parser: FilenameParser

    def format_progress_message(self) -> None:
        """Sets the progress message with the one-based index of the current item and the total number of items."""
        current_item = self.i_item + 1
        self.progress_message = f'[cyan]Importing element {current_item} of {self.n_item}'

    def start(self) -> None:
        """
        The start method.

        After the parent class start, the filename parser is created using the provided configuration file.

        :raise ParserConfigurationError: If the configuration file is not valid.
        """
        super().start()
        if TYPE_CHECKING:
            # help the static type checker: the parameter is a string at this point
            assert isinstance(self.parser_configuration, str)

        self._filename_parser = FilenameParser(self.parser_configuration)