I have list derived from JSON as below :
list = ['select', 'name1 = a.column1', 'name2 = a.column2', 'name3 = a.[column3]',
'from', 'xyz.[Table1$Name] c',
'select', 'name2 = b.othercolumn1', 'name2 = b.[othercolumn2]', 'name3 = b.othercolumn3',
'from', 'abc.[Table2$Name] d',
'where', 'x.[TableX] = '123', and so on.....]
The output I want is a data frame with 2 columns which I can extract as excel as below:
Table Name Column Name
Table1$Name column1
Table1$Name column2
Table1$Name column3
Table2$Name othercolumn1
Table2$Name othercolumn2
Table2$Name othercolumn3
I have tried in many ways but I am unable to achieve the desired output, I only want to have excel file output for all columns from this ".bim" file against the respective tables, everything else like where statement, [], c, a., b., should be removed in final output.
CodePudding user response:
- as per comments, you are really asking how to parse SQL into it's components
- this approach uses lark to parse. This is a sophisticated topic, so have demonstrated how to integrate with pandas in another way
- your sample SQL is not valid based on grammar I have used. IMHO this is a decent grammar for SQL. I have worked with many database engines, not one that I have worked with allows assignment in select clause or used of $ in column or table names
from lark import Lark, Tree, Visitor, Tree, Token, UnexpectedInput
import requests
import pandas as pd
grammar = requests.get(
"https://raw.githubusercontent.com/zbrookle/sql_to_ibis/main/sql_to_ibis/grammar/sql.lark"
).text
parser = Lark(grammar, start="query_expr")
class sqlparts(Visitor):
__columns = []
__tables = []
def __init__(self):
self.__columns = []
self.__tables = []
def column_name(self, tree):
self.__columns = [
tok.value
for tok in tree.scan_values(
lambda v: isinstance(v, Token) and v.type == "CNAME"
)
]
def table(self, tree):
self.__tables = [
tok.value
for tok in tree.scan_values(
lambda v: isinstance(v, Token) and v.type == "CNAME"
)
]
def data(self):
return {"columns": self.__columns, "tables": self.__tables}
df = pd.DataFrame(
{
"sql": [
"select col1, col2, col3 from table1, table2 where col7=8",
'select "hello" from a_long_table where col7=8',
'select "hello" from a_long_table where col7=8 groupby col8',
"select col$ from tables",
]
}
)
def applyparse(sql):
d = sqlparts()
try:
t = parser.parse(sql, start="query_expr")
d.visit(t)
return d.data()
except UnexpectedInput as e:
return {"error": e}
df.join(df["sql"].apply(applyparse).apply(pd.Series))
sql | columns | tables | error |
---|---|---|---|
select col1, col2, col3 from table1, table2 where col7=8 | ['col7', 'col1', 'col2', 'col3'] | ['table1', 'table2'] | nan |
select "hello" from a_long_table where col7=8 | ['col7'] | ['a_long_table'] | nan |
select "hello" from a_long_table where col7=8 groupby col8 | ['col7', 'col8'] | ['a_long_table'] | nan |
select col$ from tables | nan | nan | No terminal matches '$' in the current parser context, at line 1 col 11 |