Coverage for src / mafw / tools / regexp.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-09 09:08 +0000

1# Copyright 2025 European Union 

2# Author: Bulgheroni Antonio (antonio.bulgheroni@ec.europa.eu) 

3# SPDX-License-Identifier: EUPL-1.2 

4""" 

5Module implements some basic functions involving regular expressions. 

6""" 

7 

8import logging 

9import re 

10 

11from mafw.mafw_errors import UnknownProcessor 

12 

13log = logging.getLogger(__name__) 

14 

15 

16def extract_protocol(url: str) -> str | None: 

17 """ 

18 Extract the protocol portion from a database connection URL. 

19 

20 The extract_protocol function takes a database connection URL string as input and extracts the protocol portion 

21 (the part before "://"). This function is useful for identifying the database type from connection strings. 

22 

23 :param url: The url from which the protocol will be extracted. 

24 :type url: str 

25 :return: The protocol or None, if the extraction failed 

26 :rtype: str | None 

27 """ 

28 pattern = r'^([a-z0-9_\-+.]+)://' 

29 match = re.match(pattern, url) 

30 if match: 

31 return match.group(1) 

32 return None 

33 

34 

def normalize_sql_spaces(sql_string: str) -> str:
    """
    Collapse every run of consecutive space characters in a SQL string into one space.

    Leading and trailing whitespace (of any kind, including tabs and newlines) is removed
    first. Inside the remaining text only the space character is collapsed; tabs, newlines
    and other whitespace are left untouched.

    :param sql_string: The SQL string to normalise.
    :type sql_string: str
    :return: The SQL string with trimmed ends and single spaces between tokens.
    :rtype: str
    """
    trimmed = sql_string.strip()
    # Only runs of two or more spaces need rewriting; a lone space is already normalised.
    return re.sub(r' {2,}', ' ', trimmed)

46 

47 

48def parse_processor_name(processor_string: str) -> tuple[str, str | None]: 

49 """ 

50 Parse a processor name string into name and replica identifier components. 

51 

52 Given a string in the form 'MyProcessorName#156a', returns a tuple ('MyProcessorName', '156a'). 

53 If the input string is 'MyProcessorName' only, then it returns ('MyProcessorName', None). 

54 If it gets 'MyProcessorName#', it returns ('MyProcessorName', None) but emits a warning 

55 informing of a possible malformed name. 

56 

57 The processor name must be a valid Python identifier (class name). 

58 

59 :param processor_string: The processor name string to parse. 

60 :type processor_string: str 

61 :return: A tuple of (name, replica_id) where replica_id can be None. 

62 :rtype: tuple[str, str | None] 

63 :raise UnknownProcessor: if the name part is empty or not a valid Python identifier 

64 """ 

65 # Split on '#' character 

66 parts = processor_string.strip().split('#', 1) 

67 

68 # Get the name part (always exists) 

69 name = parts[0] 

70 

71 if len(name) == 0: 

72 raise UnknownProcessor('Invalid processor name (empty)') 

73 

74 # Validate that the name is a valid Python identifier 

75 if not name.isidentifier(): 

76 raise UnknownProcessor(f'Invalid processor name "{name}" - not a valid Python identifier') 

77 

78 # Check if there's a replica part 

79 if len(parts) == 1 or parts[1] == '': 

80 # No or empty replica id part 

81 if len(parts) == 2 and parts[1] == '': 

82 # Warn about malformed input like "Name#" 

83 log.warning( 

84 f"Malformed processor name '{processor_string}': empty replica part after '#'", 

85 ) 

86 return name, None 

87 else: 

88 replica_id = parts[1] 

89 return name, replica_id