Home > OS >  How to fix bad encodings in a string?
How to fix bad encodings in a string?

Time:07-15

I need to correct bad encodings from a string.

This is an example of what it should be \u00c3\u00ba=ú, \u00c3\u00b1=ñ, \u00c3\u00b3=ó

This is the string:

x = '[{"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefono 1", "Telefono 2", "Telefono 3", "Telefono 4", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-addition", "engineConfig": {"facets": [], "mode": "row-based"}, "baseColumnName": "Telefono 1", "expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')", "onError": "keep-original", "newColumnName": "Telefonos", "columnInsertIndex": 15, "description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}, {"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-reorder", "columnNames": ["\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campa\u00c3\u00b1a", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Direcci\u00c3\u00b3n Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Monto inicial", "expression": "value.toNumber()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Monto inicial using expression value.toNumber()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Direcci\u00c3\u00b3n Completa", "expression": "value.toTitlecase()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Direcci\u00c3\u00b3n Completa using expression value.toTitlecase()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "\u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente", "expression": "grel:value.substring(0, value.length()-2)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column \u00ef\u00bb\u00bfN\u00c3\u00bamero de Cliente using expression grel:value.substring(0, value.length()-2)"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Nombre Campa\u00c3\u00b1a", "expression": "grel:value.substring(4)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Nombre Campa\u00c3\u00b1a using expression grel:value.substring(4)"}]'

This is the code I use to get the string:

import json
import ftfy

with open('get-operations.json') as j:
    data = json.load(j)
new_data = []
for dic in data['entries']:
    for key,value in dic.items():
        if key == 'operation':
            dic = {k:v for k,v in value.items()}
            new_data.append(dic)

x = json.dumps(new_data)
print(ftfy.ftfy(x))

I just want to extract the "operations" from this json file (get-operations.json):

{"entries":[{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefono 1","Telefono 2","Telefono 3","Telefono 4","Segmento","[Documento?]","[AccStsName?]","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","MIN","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Create new column Telefonos based on column Telefono 1 by filling 99 rows with join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')","operation":{"op":"core/column-addition","engineConfig":{"facets":[],"mode":"row-based"},"baseColumnName":"Telefono 1","expression":"join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')","onError":"keep-original","newColumnName":"Telefonos","columnInsertIndex":15,"description":"Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}},{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefonos","Segmento","[Documento?]","[AccStsName?]","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","MIN","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Reorder columns","operation":{"op":"core/column-reorder","columnNames":["Número de Cliente","[Account.AccountCode?]","CRM Origen","Monto inicial","[ModInitCta?]","[DeudaRealCuenta?]","[BillCycleName?]","Nombre Campaña","Nombre Casa de Cobro","Fecha de Asignacion","Deuda Gestionable","Dirección Completa","Fecha Final","Email","Telefonos","Segmento","Ciudad","[InboxName?]","Nombre del Cliente","Id de Ejecucion","Fecha de Vencimiento","Numero Referencia de Pago","Plan","Precio Subscripcion"],"description":"Reorder columns"}},{"description":"Text transform on 99 cells in column Monto inicial: value.toNumber()","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Monto inicial","expression":"value.toNumber()","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Monto inicial using expression value.toNumber()"}},{"description":"Text transform on 94 cells in column Dirección Completa: value.toTitlecase()","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Dirección Completa","expression":"value.toTitlecase()","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Dirección Completa using expression value.toTitlecase()"}},{"description":"Text transform on 99 cells in column Número de Cliente: grel:value.substring(0, value.length()-2)","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Número de Cliente","expression":"grel:value.substring(0, value.length()-2)","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"}},{"description":"Text transform on 99 cells in column Nombre Campaña: grel:value.substring(4)","operation":{"op":"core/text-transform","engineConfig":{"facets":[],"mode":"row-based"},"columnName":"Nombre Campaña","expression":"grel:value.substring(4)","onError":"keep-original","repeat":false,"repeatCount":10,"description":"Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"}}]}

I've tried to use the ftfy module but it doesn't work. Any idea how I can fix it?

CodePudding user response:

Use the right encoding when reading the file, and use ensure_ascii=False when writing the JSON to see human-readable non-ASCII characters. It is still valid JSON as ASCII it is just a visual issue:

import json

with open('get-operations.json', encoding='utf-8-sig') as j:
    data = json.load(j)
new_data = []
for dic in data['entries']:
    for key,value in dic.items():
        if key == 'operation':
            new_data.append(value)  # original line was over complicated :)

x = json.dumps(new_data, ensure_ascii=False)
print(x)

Output:

[{"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefono 1", "Telefono 2", "Telefono 3", "Telefono 4", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-addition", "engineConfig": {"facets": [], "mode": "row-based"}, "baseColumnName": "Telefono 1", "expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')", "onError": "keep-original", "newColumnName": "Telefonos", "columnInsertIndex": 15, "description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"}, {"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "[Documento?]", "[AccStsName?]", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "MIN", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/column-reorder", "columnNames": ["Número de Cliente", "[Account.AccountCode?]", "CRM Origen", "Monto inicial", "[ModInitCta?]", "[DeudaRealCuenta?]", "[BillCycleName?]", "Nombre Campaña", "Nombre Casa de Cobro", "Fecha de Asignacion", "Deuda Gestionable", "Dirección Completa", "Fecha Final", "Email", "Telefonos", "Segmento", "Ciudad", "[InboxName?]", "Nombre del Cliente", "Id de Ejecucion", "Fecha de Vencimiento", "Numero Referencia de Pago", "Plan", "Precio Subscripcion"], "description": "Reorder columns"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Monto inicial", "expression": "value.toNumber()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Monto inicial using expression value.toNumber()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Dirección Completa", "expression": "value.toTitlecase()", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Dirección Completa using expression value.toTitlecase()"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Número de Cliente", "expression": "grel:value.substring(0, value.length()-2)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"}, {"op": "core/text-transform", "engineConfig": {"facets": [], "mode": "row-based"}, "columnName": "Nombre Campaña", "expression": "grel:value.substring(4)", "onError": "keep-original", "repeat": false, "repeatCount": 10, "description": "Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"}]

You can also use indent=2 in the json.dumps() to get "pretty" output:

[
  {
    "op": "core/column-reorder",
    "columnNames": [
      "Número de Cliente",
      "[Account.AccountCode?]",
      "CRM Origen",
      "Monto inicial",
      "[ModInitCta?]",
      "[DeudaRealCuenta?]",
      "[BillCycleName?]",
      "Nombre Campaña",
      "Nombre Casa de Cobro",
      "Fecha de Asignacion",
      "Deuda Gestionable",
      "Dirección Completa",
      "Fecha Final",
      "Email",
      "Telefono 1",
      "Telefono 2",
      "Telefono 3",
      "Telefono 4",
      "Segmento",
      "[Documento?]",
      "[AccStsName?]",
      "Ciudad",
      "[InboxName?]",
      "Nombre del Cliente",
      "Id de Ejecucion",
      "Fecha de Vencimiento",
      "Numero Referencia de Pago",
      "MIN",
      "Plan",
      "Precio Subscripcion"
    ],
    "description": "Reorder columns"
  },
  {
    "op": "core/column-addition",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "baseColumnName": "Telefono 1",
    "expression": "join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')",
    "onError": "keep-original",
    "newColumnName": "Telefonos",
    "columnInsertIndex": 15,
    "description": "Create column Telefonos at index 15 based on column Telefono 1 using expression join ([coalesce(cells['Telefono 1'].value,''),coalesce(cells['Telefono 2'].value,''),coalesce(cells['Telefono 3'].value,''),coalesce(cells['Telefono 4'].value,'')],' / ')"
  },
  {
    "op": "core/column-reorder",
    "columnNames": [
      "Número de Cliente",
      "[Account.AccountCode?]",
      "CRM Origen",
      "Monto inicial",
      "[ModInitCta?]",
      "[DeudaRealCuenta?]",
      "[BillCycleName?]",
      "Nombre Campaña",
      "Nombre Casa de Cobro",
      "Fecha de Asignacion",
      "Deuda Gestionable",
      "Dirección Completa",
      "Fecha Final",
      "Email",
      "Telefonos",
      "Segmento",
      "[Documento?]",
      "[AccStsName?]",
      "Ciudad",
      "[InboxName?]",
      "Nombre del Cliente",
      "Id de Ejecucion",
      "Fecha de Vencimiento",
      "Numero Referencia de Pago",
      "MIN",
      "Plan",
      "Precio Subscripcion"
    ],
    "description": "Reorder columns"
  },
  {
    "op": "core/column-reorder",
    "columnNames": [
      "Número de Cliente",
      "[Account.AccountCode?]",
      "CRM Origen",
      "Monto inicial",
      "[ModInitCta?]",
      "[DeudaRealCuenta?]",
      "[BillCycleName?]",
      "Nombre Campaña",
      "Nombre Casa de Cobro",
      "Fecha de Asignacion",
      "Deuda Gestionable",
      "Dirección Completa",
      "Fecha Final",
      "Email",
      "Telefonos",
      "Segmento",
      "Ciudad",
      "[InboxName?]",
      "Nombre del Cliente",
      "Id de Ejecucion",
      "Fecha de Vencimiento",
      "Numero Referencia de Pago",
      "Plan",
      "Precio Subscripcion"
    ],
    "description": "Reorder columns"
  },
  {
    "op": "core/text-transform",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "columnName": "Monto inicial",
    "expression": "value.toNumber()",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10,
    "description": "Text transform on cells in column Monto inicial using expression value.toNumber()"
  },
  {
    "op": "core/text-transform",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "columnName": "Dirección Completa",
    "expression": "value.toTitlecase()",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10,
    "description": "Text transform on cells in column Dirección Completa using expression value.toTitlecase()"
  },
  {
    "op": "core/text-transform",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "columnName": "Número de Cliente",
    "expression": "grel:value.substring(0, value.length()-2)",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10,
    "description": "Text transform on cells in column Número de Cliente using expression grel:value.substring(0, value.length()-2)"
  },
  {
    "op": "core/text-transform",
    "engineConfig": {
      "facets": [],
      "mode": "row-based"
    },
    "columnName": "Nombre Campaña",
    "expression": "grel:value.substring(4)",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10,
    "description": "Text transform on cells in column Nombre Campaña using expression grel:value.substring(4)"
  }
]
  • Related