I need to correct bad encodings from a string.
This is an example of what it should be \u00c3\u00ba=ú, \u00c3\u00b1=ñ, \u00c3\u00b3=ó
This is the string:
x = '[{"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefono 1", "Telefono 2", "Telefono 3", "Telefono 4", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-addition", "engineConfig": {"facets": [], "mode": "row-based"}, "baseColumnName": "Telefono 1", "expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')", "onError": "keep-original", "newColumnName": "Telefonos", "columnInsertIndex": 15, "description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}, {"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Monto inicial", "expression": "value.toNumber()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Monto inicial using expression value.toNumber()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Direcci\u00c3\u00b3n Completa", "expression": "value.toTitlecase()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Direcci\u00c3\u00b3n Completa using expression value.toTitlecase()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "expression": "grel:value.substring(0, value.length()-2)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column \u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente using expression grel:value.substring(0, value.length()-2)"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Nombre Campa\u00c3\u00b1a", "expression": "grel:value.substring(4)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Nombre Campa\u00c3\u00b1a using expression grel:value.substring(4)"}]'
This is the code I use to get the string:
import json
import ftfy
with open('get-operations.json') as j:
data = json.load(j)
new_data = []
for dic in data['entries']:
for key,value in dic.items():
if key == 'operation':
dic = {k:v for k,v in value.items()}
new_data.append(dic)
x = json.dumps(new_data)
print(ftfy.ftfy(x))
I just want to extract the "operations" from this json file (get-operations.json):
{"entries":[{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefono 1","Telefono 2","Telefono 3","Telefono 4","Segmento","[Documento?]","[AccStsName?]","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","MIN","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Create new column Telefonos based on column Telefono 1 by filling 99 rows with join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')","operation":{"op":"core/column-addition","engineConfig":{"facets":[],"mode":"row-based"},"baseColumnName":"Telefono 1","expression":"join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')","onError":"keep-original","newColumnName":"Telefonos","columnInsertIndex":15,"description":"Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}},{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefonos","Segmento","[Documento?]","[AccStsName?]","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","MIN","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefonos","Segmento","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Text transform on 99 cells in column Monto inicial: value.toNumber()","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Monto inicial","expression":"value.toNumber()","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Monto inicial using expression value.toNumber()"}},{"description":"Text transform on 94 cells in column Dirección Completa: value.toTitlecase()","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Dirección Completa","expression":"value.toTitlecase()","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Dirección Completa using expression value.toTitlecase()"}},{"description":"Text transform on 99 cells in column Número de Cliente: grel:value.substring(0, value.length()-2)","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Número de Cliente","expression":"grel:value.substring(0, value.length()-2)","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"}},{"description":"Text transform on 99 cells in column Nombre Campaña: grel:value.substring(4)","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Nombre Campaña","expression":"grel:value.substring(4)","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"}}]}
I've tried to use the ftfy module but it doesn't work. Any idea how I can fix it?
CodePudding user response:
Use the right encoding when reading the file, and use ensure_ascii=False
when writing the JSON to see human-readable non-ASCII characters. It is still valid JSON as ASCII it is just a visual issue:
import json
with open('get-operations.json', encoding='utf-8-sig') as j:
data = json.load(j)
new_data = []
for dic in data['entries']:
for key,value in dic.items():
if key == 'operation':
new_data.append(value) # original line was over complicated :)
x = json.dumps(new_data, ensure_ascii=False)
print(x)
Output:
[{"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefono 1", "Telefono 2", "Telefono 3", "Telefono 4", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-addition", "engineConfig": {"facets": [], "mode": "row-based"}, "baseColumnName": "Telefono 1", "expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')", "onError": "keep-original", "newColumnName": "Telefonos", "columnInsertIndex": 15, "description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}, {"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Monto inicial", "expression": "value.toNumber()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Monto inicial using expression value.toNumber()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Dirección Completa", "expression": "value.toTitlecase()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Dirección Completa using expression value.toTitlecase()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Número de Cliente", "expression": "grel:value.substring(0, value.length()-2)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Nombre Campaña", "expression": "grel:value.substring(4)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"}]
You can also use indent=2
in the json.dumps()
to get "pretty" output:
[
{
"op": "core/column-reorder",
"columnNames": [
"Número de Cliente",
"[Account.AccountCode?]",
"CRM Origen",
"Monto inicial",
"[ModInitCta?]",
"[DeudaRealCuenta?]",
"[BillCycleName?]",
"Nombre Campaña",
"Nombre Casa de Cobro",
"Fecha de Asignacion",
"Deuda Gestionable",
"Dirección Completa",
"Fecha Final",
"Email",
"Telefono 1",
"Telefono 2",
"Telefono 3",
"Telefono 4",
"Segmento",
"[Documento?]",
"[AccStsName?]",
"Ciudad",
"[InboxName?]",
"Nombre del Cliente",
"Id de Ejecucion",
"Fecha de Vencimiento",
"Numero Referencia de Pago",
"MIN",
"Plan",
"Precio Subscripcion"
],
"description": "Reorder columns"
},
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "Telefono 1",
"expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')",
"onError": "keep-original",
"newColumnName": "Telefonos",
"columnInsertIndex": 15,
"description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"
},
{
"op": "core/column-reorder",
"columnNames": [
"Número de Cliente",
"[Account.AccountCode?]",
"CRM Origen",
"Monto inicial",
"[ModInitCta?]",
"[DeudaRealCuenta?]",
"[BillCycleName?]",
"Nombre Campaña",
"Nombre Casa de Cobro",
"Fecha de Asignacion",
"Deuda Gestionable",
"Dirección Completa",
"Fecha Final",
"Email",
"Telefonos",
"Segmento",
"[Documento?]",
"[AccStsName?]",
"Ciudad",
"[InboxName?]",
"Nombre del Cliente",
"Id de Ejecucion",
"Fecha de Vencimiento",
"Numero Referencia de Pago",
"MIN",
"Plan",
"Precio Subscripcion"
],
"description": "Reorder columns"
},
{
"op": "core/column-reorder",
"columnNames": [
"Número de Cliente",
"[Account.AccountCode?]",
"CRM Origen",
"Monto inicial",
"[ModInitCta?]",
"[DeudaRealCuenta?]",
"[BillCycleName?]",
"Nombre Campaña",
"Nombre Casa de Cobro",
"Fecha de Asignacion",
"Deuda Gestionable",
"Dirección Completa",
"Fecha Final",
"Email",
"Telefonos",
"Segmento",
"Ciudad",
"[InboxName?]",
"Nombre del Cliente",
"Id de Ejecucion",
"Fecha de Vencimiento",
"Numero Referencia de Pago",
"Plan",
"Precio Subscripcion"
],
"description": "Reorder columns"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Monto inicial",
"expression": "value.toNumber()",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Monto inicial using expression value.toNumber()"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Dirección Completa",
"expression": "value.toTitlecase()",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Dirección Completa using expression value.toTitlecase()"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Número de Cliente",
"expression": "grel:value.substring(0, value.length()-2)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Nombre Campaña",
"expression": "grel:value.substring(4)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"
}
]