parifinder extracts structured data from text using user-defined delimiters (strings or regex), making it versatile for data processing.
The function handles a wide range of parsing scenarios: it accepts a single pair of delimiters or multiple pairs, and each delimiter can be either a simple string or a complex regular expression. This flexibility makes it suitable for many different use cases.
It can parse multiple pairs of delimiters within a given text, which is especially useful when dealing with documents or data containing nested elements.
It uses only Python's standard library.
from parifinder import parse_pairs
from pprint import pprint
def _show(header, result):
    """Print *header* on its own line, then pretty-print *result*."""
    print(header)
    # width=1 forces pprint to wrap wherever it can, one item per line.
    pprint(result, indent=1, width=1)


# Example 0: one-character string delimiters on text with nested brackets.
text_0 = """[[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]][[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]]"""
s1_0, s2_0 = "[", "]"
r0 = parse_pairs(string=text_0, s1=s1_0, s2=s2_0, str_regex=False)
_show("r0-----------------------------------------------------------------", r0)

# Example 1: multi-character string delimiters (an HTML tag pair).
text_1 = "<body><p>a</p><p>a</p><p>The HTML <code>button</code> tag defines a clickable button.</p><p>x</p><p>The CSS <code>background-color</code> property defines the background color of an element.</p></body></html>"
s1_1, s2_1 = "<p>", "</p>"
r1 = parse_pairs(string=text_1, s1=s1_1, s2=s2_1, str_regex=False)
_show("r1-----------------------------------------------------------------", r1)

# Example 2: both delimiters are regular expressions (str_regex=True).
text_2 = "[1bla[2bla/2]/1]"
s1_2, s2_2 = r"\[\d", r"/\d]"
r2 = parse_pairs(string=text_2, s1=s1_2, s2=s2_2, str_regex=True)
_show("r2-----------------------------------------------------------------", r2)

# Example 3: several pairs at once, given as (opener, closer) tuples in s1;
# s2 is left as None.
text_3 = "[1bla[2bla/2]/1]"
s1_3 = [("[1", "/1]"), ("[2", "/2]")]
s2_3 = None
r3 = parse_pairs(string=text_3, s1=s1_3, s2=s2_3, str_regex=False)
_show("r3-----------------------------------------------------------------", r3)

# Example 4: the same pairs as example 3, given as two parallel lists
# (openers in s1, closers in s2).
text_4 = "[1bla[2bla/2]/1]"
s1_4 = ["[1", "[2"]
s2_4 = ["/1]", "/2]"]
r4 = parse_pairs(string=text_4, s1=s1_4, s2=s2_4, str_regex=False)
_show("r4-----------------------------------------------------------------", r4)
# r0-----------------------------------------------------------------
# {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23): {'children': [(1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9),
# (17,
# 18,
# 19,
# 20,
# 21,
# 22),
# (12,
# 13,
# 14)],
# 'end': 23,
# 'parents': [],
# 'size': 23,
# 'start': 0,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (1, 2, 3, 4, 5, 6, 7, 8, 9): {'children': [],
# 'end': 9,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 8,
# 'start': 1,
# 'text': '[1, '
# '2, '
# '2]'},
# (12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 2,
# 'start': 12,
# 'text': '[5]'},
# (17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 5,
# 'start': 17,
# 'text': '[2, '
# '3]'},
# (30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57): {'children': [(31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39),
# (42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48),
# (51,
# 52,
# 53,
# 54,
# 55,
# 56)],
# 'end': 57,
# 'parents': [],
# 'size': 27,
# 'start': 30,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (31, 32, 33, 34, 35, 36, 37, 38, 39): {'children': [],
# 'end': 39,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 8,
# 'start': 31,
# 'text': '[4, '
# '4, '
# '4]'},
# (42, 43, 44, 45, 46, 47, 48): {'children': [],
# 'end': 48,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 6,
# 'start': 42,
# 'text': '[12, '
# '0]'},
# (51, 52, 53, 54, 55, 56): {'children': [],
# 'end': 56,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 5,
# 'start': 51,
# 'text': '[6, '
# '6]'},
# (63, 64, 65, 66, 67, 68, 69, 70): {'children': [(64,
# 65,
# 66,
# 67,
# 68,
# 69)],
# 'end': 70,
# 'parents': [],
# 'size': 7,
# 'start': 63,
# 'text': '[[1, '
# '2]]'},
# (64, 65, 66, 67, 68, 69): {'children': [],
# 'end': 69,
# 'parents': [(63,
# 64,
# 65,
# 66,
# 67,
# 68,
# 69,
# 70)],
# 'size': 5,
# 'start': 64,
# 'text': '[1, '
# '2]'},
# (71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94): {'children': [(72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80),
# (88,
# 89,
# 90,
# 91,
# 92,
# 93),
# (83,
# 84,
# 85)],
# 'end': 94,
# 'parents': [],
# 'size': 23,
# 'start': 71,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (72, 73, 74, 75, 76, 77, 78, 79, 80): {'children': [],
# 'end': 80,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 8,
# 'start': 72,
# 'text': '[1, '
# '2, '
# '2]'},
# (83, 84, 85): {'children': [],
# 'end': 85,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 2,
# 'start': 83,
# 'text': '[5]'},
# (88, 89, 90, 91, 92, 93): {'children': [],
# 'end': 93,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 5,
# 'start': 88,
# 'text': '[2, '
# '3]'},
# (101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128): {'children': [(102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110),
# (113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119),
# (122,
# 123,
# 124,
# 125,
# 126,
# 127)],
# 'end': 128,
# 'parents': [],
# 'size': 27,
# 'start': 101,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (102, 103, 104, 105, 106, 107, 108, 109, 110): {'children': [],
# 'end': 110,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 8,
# 'start': 102,
# 'text': '[4, '
# '4, '
# '4]'},
# (113, 114, 115, 116, 117, 118, 119): {'children': [],
# 'end': 119,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 6,
# 'start': 113,
# 'text': '[12, '
# '0]'},
# (122, 123, 124, 125, 126, 127): {'children': [],
# 'end': 127,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 5,
# 'start': 122,
# 'text': '[6, '
# '6]'},
# (134, 135, 136, 137, 138, 139, 140, 141): {'children': [(135,
# 136,
# 137,
# 138,
# 139,
# 140)],
# 'end': 141,
# 'parents': [],
# 'size': 7,
# 'start': 134,
# 'text': '[[1, '
# '2]]'},
# (135, 136, 137, 138, 139, 140): {'children': [],
# 'end': 140,
# 'parents': [(134,
# 135,
# 136,
# 137,
# 138,
# 139,
# 140,
# 141)],
# 'size': 5,
# 'start': 135,
# 'text': '[1, '
# '2]'}}
# r1-----------------------------------------------------------------
# {(6, 7, 8, 9, 10, 11, 12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [],
# 'size': 9,
# 'start': 6,
# 'text': '<p>a</p>'},
# (14, 15, 16, 17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [],
# 'size': 9,
# 'start': 14,
# 'text': '<p>a</p>'},
# (22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89): {'children': [],
# 'end': 89,
# 'parents': [],
# 'size': 68,
# 'start': 22,
# 'text': '<p>The '
# 'HTML '
# '<code>button</code> '
# 'tag '
# 'defines '
# 'a '
# 'clickable '
# 'button.</p>'},
# (89, 90, 91, 92, 93, 94, 95, 96, 97): {'children': [],
# 'end': 97,
# 'parents': [],
# 'size': 9,
# 'start': 89,
# 'text': '<p>x</p>'},
# (97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194): {'children': [],
# 'end': 194,
# 'parents': [],
# 'size': 98,
# 'start': 97,
# 'text': '<p>The '
# 'CSS '
# '<code>background-color</code> '
# 'property '
# 'defines '
# 'the '
# 'background '
# 'color '
# 'of '
# 'an '
# 'element.</p>'}}
# r2-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[1', '/2]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 14,
# 'start': 0,
# 'text': '[1bla[2bla/2]'}},
# ('[2', '/1]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 12,
# 'start': 5,
# 'text': '[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r3-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r4-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}