I have this JSON
{
"journal.pbio.0050304.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
],
[
{"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
{"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
{"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
{"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
]
]
},
"journal.pbio.0050093.xml": {
"sentence": [
[
{"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
]
]
}
}
I would like to keep only the entity group, start, and end of each entry and convert them into tuples, like this:
[(0, 299, 'literal'), (186, 194, 'literal'), (196, 199, 'metaphoric')]
and so on. How can I do it?
CodePudding user response:
Something like this?
from pprint import pprint

# Sample input: file name -> {"sentence": [sentence, ...]}, where each
# sentence is a list of entity dicts carrying "entity_group", "score",
# "word", "start" and "end".
data = {
    "journal.pbio.0050304.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ],
            [
                {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
            ]
        ]
    },
    "journal.pbio.0050093.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ]
        ]
    }
}

# Convert every entity dict into a (start, end, entity_group) tuple.
# The conversion is done in place: each sentence slot in `data` is
# overwritten, so the per-file / per-sentence nesting is preserved and
# only the innermost dicts are replaced.
for file in data.values():
    sentences = file["sentence"]
    for idx, sentence in enumerate(sentences):
        sentences[idx] = [
            (word["start"], word["end"], word["entity_group"])
            for word in sentence
        ]

pprint(data)
Result:
{'journal.pbio.0050093.xml': {'sentence': [[(0, 299, 'literal')]]},
 'journal.pbio.0050304.xml': {'sentence': [[(0, 299, 'literal')],
                                           [(0, 118, 'literal'),
                                            (118, 120, 'metaphoric'),
                                            (120, 149, 'literal'),
                                            (150, 154, 'metaphoric')]]}}
CodePudding user response:
You just need to iterate over the dictionary values first, then over the lists and their sub-lists, as follows:
# Sample input: file name -> {"sentence": [sentence, ...]}, where each
# sentence is a list of entity dicts carrying "entity_group", "score",
# "word", "start" and "end".
data = {
    "journal.pbio.0050304.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ],
            [
                {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
            ]
        ]
    },
    "journal.pbio.0050093.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ]
        ]
    }
}

# Flatten all entities from all files and sentences into one list of
# (start, end, entity_group) tuples, in dictionary / list order.
# `.get` is used throughout so that a file without a 'sentence' key is
# skipped and a missing field yields None instead of raising KeyError.
output = []
for v in data.values():
    for s in v.get('sentence', []):
        output.extend(
            (d.get('start'), d.get('end'), d.get('entity_group')) for d in s
        )

print(output)
Output:
[(0, 299, 'literal'), (0, 118, 'literal'), (118, 120, 'metaphoric'), (120, 149, 'literal'), (150, 154, 'metaphoric'), (0, 299, 'literal')]