I need to analyze a large table with hundreds of columns. A lot of columns are unused.
To investigate I could do something like
SELECT DISTINCT Column1
FROM myTable
or
WITH C AS
(
SELECT DISTINCT Column1
FROM MyTable
)
SELECT COUNT(*)
FROM C
Then I do the same for column2 and so on. However these queries only work for one column which is time consuming and does not give overview in one glance.
Any idea how to build such investigation query for all columns in one?
CodePudding user response:
You need only 1 query where you have to list all the columns of the table:
SELECT COUNT(DISTINCT Column1) column1_count,
COUNT(DISTINCT Column2) column2_count,
COUNT(DISTINCT Column3) column3_count
.....................................
FROM MyTable;
CodePudding user response:
For local purposes only, you can make it dynamic like this:
Get the columns of the table the query is created as the colleagues did and then it is executed with the EXEC()
DECLARE @columns as Table(RowId INT IDENTITY(1,1), ColumnName nVarchar(50))
DECLARE @ii int = 0
DECLARE @max int = 0
DECLARE @sqlQuery nVarchar(MAX)
INSERT INTO @columns
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = N'Customer'
SET @sqlQuery = 'SELECT '
SELECT @max = COUNT(*) FROM @columns
WHILE @ii <= @max
BEGIN
SELECT @sqlQuery = CONCAT(@sqlQuery,'COUNT(DISTINCT ',ColumnName,') ',LOWER(ColumnName),'_count, ')
FROM @columns
WHERE RowId = @ii
SET @ii = @ii 1
END
SELECT @sqlQuery = CONCAT(@sqlQuery,'FROM Customer')
SELECT @sqlQuery = REPLACE(@sqlQuery,', FROM',' FROM')
select @sqlQuery
EXEC (@sqlQuery)
CodePudding user response:
You should flesh out your requirement a bit more. If all you want to know is if a column contains only NULLs, you'll want to check for max(ColumnName) is null
declare @sql table (id int identity(1,1), QueryString nvarchar(max))
create table ##emptyColumns (emptyColumn nvarchar(128))
declare @i int = 0
declare @iMax int
declare @runthis nvarchar(max)
insert @sql
select 'select ''' QUOTENAME(s.name) '.' QUOTENAME(o.name) quotename(c.name) ''' as ''column''
from ' QUOTENAME(s.name) '.' QUOTENAME(o.name) '
having max(' c.name ') is null'
from sys.sysobjects o
inner join sys.syscolumns c on c.id = o.id
inner join sys.schemas s on s.schema_id = o.uid
where o.type = 'U'
order by s.name
, o.name
, c.colorder
select @iMax = count(*)
from @sql
print @iMax
while @i < @iMax
begin
set @i = @i 1
select @runthis = 'insert into ##emptyColumns
' QueryString
from @sql
where id = @i
execute sp_executesql @runthis
end
select *
from ##emptyColumns
drop table ##emptyColumns
CodePudding user response:
One further option you might consider:
declare @sql nvarchar(max)
select @sql = isnull(@sql ' union all ', '') 'select ''' COLUMN_NAME ''',
sum(case when ' COLUMN_NAME ' is null then 1 else 0 end) as null_values,
count(distinct ' COLUMN_NAME ') as count_distinct
from ' TABLE_SCHEMA '.' TABLE_NAME '
'
from information_schema.columns
where TABLE_SCHEMA = 'MySchema' and TABLE_NAME = 'MyTable'
exec (@sql)
If you had very big tables with large numbers of columns and were only interested in empty columns you could look into something like checksum_agg(checksum(column_name)). It may help improve performance.
You'd need to be wary of column data types, as they are not all compatible with distinct.