I inherited from a SQL query which is quite slow to run (2-2mins20 in average) while the amount of data isn't huge.
I'm not yet much comfortable with SQL but I did attempt to use temporary table variable in order to have a lookup table to find the region associated to each countries. It didn't change anything regarding the total cost of the query (which is 4780, found via an EXPLAIN in DataGrid).
I'm unsure of how I could make it faster besides the creation of the right indexes.
SELECT CASE
WHEN Sites.country IN
('Australia', 'Hong Kong', 'India', 'Korea, Republic of', 'Malaysia', 'New Zealand', 'Philippines',
'Singapore', 'Taiwan', 'Thailand', 'Viet Nam')
THEN 'APAC'
WHEN Sites.country IN ('China')
THEN 'China'
WHEN Sites.country IN
('Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'Finland',
'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Israel', 'Italy', 'Latvia', 'Lebanon',
'Lithuania', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russian Federation',
'Saudi Arabia', 'Serbia', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine',
'United Arab Emirates', 'United Kingdom')
THEN 'EMEA'
WHEN Sites.country IN ('Japan')
THEN 'Japan'
WHEN Sites.country IN
('Argentina', 'Brazil', 'Chile', 'Colombia', 'Costa Rica', 'Guatemala', 'Mexico', 'Peru', 'Puerto Rico')
THEN 'Latin America'
WHEN Sites.country IN ('Canada', 'United States')
THEN 'North America'
ELSE '?'
END AS 'Region',
Sites.country AS 'Country',
Protocols.number AS 'Protocol #',
Sites.number AS 'Site #',
VisitReports.finalized AS 'Finalized date',
VisitReports.startDate AS 'Start date',
VisitReports.endDate AS 'End date',
VisitReports.visitMode AS 'Mode',
LEN(REPLACE(VisitReports.remoteVisitDates, ',', '')) / 13 AS '# of remote dates',
VisitReports.onSiteVisitDates AS 'On site dates',
VisitReports.remoteVisitDates AS 'Remote dates',
VisitReports.visitId AS 'Visit ID',
CASE
WHEN R2Answer.data LIKE '%choice_yes%' THEN 'Yes'
WHEN R2Answer.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END AS 'R2',
CASE
WHEN SDR5Answer.data LIKE '%choice_yes%' THEN 'Yes'
WHEN SDR5Answer.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END AS 'SDR5',
CASE
WHEN SDR5_3Answer.data LIKE '%choice_yes%' THEN 'Yes'
WHEN SDR5_3Answer.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END AS 'SDR5_3',
CASE
WHEN SDR6Answer.data LIKE '%choice_yes%' THEN 'Yes'
WHEN SDR6Answer.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END AS 'SDR6',
CASE
WHEN SDR7Answer.data LIKE '%choice_yes%' THEN 'Yes'
WHEN SDR7Answer.data LIKE '%choice_no%' THEN 'No'
WHEN SDR7Answer.data LIKE '%choice_n_a%' THEN 'N/A'
ELSE '-'
END AS 'SDR7',
(SELECT COUNT(*)
FROM [DB].[SCHEMA].ActionItems
WHERE ActionItems.siteId = Sites.id
AND ActionItems.deletedAt IS NULL
AND ActionItems.createdAt > VisitReports.startDate
AND ActionItems.createdAt < VisitReports.finalized) AS '# created ActionItems',
(SELECT COUNT(DISTINCT ActionItems.id)
FROM [DB].[SCHEMA].ActionItemUpdates
JOIN [DB].[SCHEMA].ActionItems
ON ActionItems.id = ActionItemUpdates.actionItemId
AND ActionItems.siteId = Sites.id
WHERE ActionItemUpdates.deletedAt IS NULL
AND ActionItemUpdates.updateType IS NULL
AND ActionItemUpdates.createdAt > VisitReports.startDate
AND ActionItemUpdates.createdAt < VisitReports.finalized) AS '# updated ActionItems',
(SELECT COUNT(*)
FROM [DB].[SCHEMA].ActionItems
WHERE ActionItems.siteId = Sites.id
AND ActionItems.deletedAt IS NULL
AND ActionItems.closed > VisitReports.startDate
AND ActionItems.closed < VisitReports.finalized) AS '# closed ActionItems'
FROM [DB].[SCHEMA].Sites
JOIN [DB].[SCHEMA].Protocols on Protocols.id = Sites.protocolId
JOIN [DB].[SCHEMA].VisitReports ON VisitReports.siteId = Sites.id
AND questionnaireVersion >= 7
AND visitType = 'ongoing'
AND finalized IS NOT NULL
AND VisitReports.endDate > '2022-01-01' AND VisitReports.endDate < '2022-11-11'
LEFT JOIN [DB].[SCHEMA].Answers AS R2Answer ON R2Answer.reportId = VisitReports.id
AND R2Answer.questionId = 'R3'
LEFT JOIN [DB].[SCHEMA].Answers AS SDR5Answer
ON SDR5Answer.reportId = VisitReports.id
AND SDR5Answer.questionId = 'SDR3'
LEFT JOIN [DB].[SCHEMA].Answers AS SDR5_3Answer
ON SDR5_3Answer.reportId = VisitReports.id
AND SDR5_3Answer.questionId = 'SDR3.3'
LEFT JOIN [DB].[SCHEMA].Answers AS SDR6Answer
ON SDR6Answer.reportId = VisitReports.id
AND SDR6Answer.questionId = 'SDR10'
LEFT JOIN [DB].[SCHEMA].Answers AS SDR7Answer
ON SDR7Answer.reportId = VisitReports.id
AND SDR7Answer.questionId = 'SDR4'
ORDER BY [Finalized date]
Thank you in advance for your help!
CodePudding user response:
You can try wth OUTER APPLY
and conditional aggregation (CASE WHEN
inside the aggregation function).
Something like:
SELECT
...
s.country AS [Country],
p.number AS [Protocol #],
s.number AS [Site #],
vr.finalized AS [Finalized date],
vr.startdate AS [Start date],
vr.enddate AS [End date],
vr.visitmode AS [Mode],
LEN(REPLACE(vr.remotevisitdates, ',', '')) / 13 AS [# of remote dates],
vr.onsitevisitdates AS [On site dates],
vr.remotevisitdates AS [Remote dates],
vr.visitId AS [Visit ID],
ans.[R3] AS [Answer R3],
ans.[SDR3] AS [Answer SDR3],
...
act.created AS [# created ActionItems],
act.updated AS [# updated ActionItems],
act.closed AS [# closed ActionItems]
FROM sites s
JOIN protocols p ON p.id = s.protocolid
JOIN visitreports vr ON vr.siteid = s.id
AND vr.questionnaireversion >= 7
AND vr.visittype = 'ongoing'
AND vr.finalized IS NOT NULL
AND vr.enddate > '2022-01-01'
AND vr.enddate < '2022-11-11'
OUTER APPLY
(
SELECT
COUNT(CASE WHEN ai.deletedat IS NULL AND ai.createdat > vr.startdate AND ai.createdat < vr.finalized THEN 1 END) AS created,
COUNT(CASE WHEN ai.deletedat IS NULL AND ai.closed > vr.startdate AND ai.closed < vr.finalized THEN 1 END) AS closed,
COUNT(CASE WHEN EXISTS
(
SELECT NULL
FROM actionitemupdates aiu
WHERE aiu.actionitemid = ai.id
AND aiu.updateType IS NULL
AND aiu.createdAt > vr.startDate
AND aiu.createdAt < vr.finalized
) THEN 1 END) AS updated
FROM actionitems ai
WHERE ai.siteid = s.id
) act
OUTER APPLY
(
SELECT
MAX(CASE WHEN a.questionId = 'R3' THEN
CASE WHEN a.data LIKE '%choice_yes%' THEN 'Yes'
WHEN a.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END
END) AS [R3],
MAX(CASE WHEN a.questionId = 'SDR3' THEN
CASE WHEN a.data LIKE '%choice_yes%' THEN 'Yes'
WHEN a.data LIKE '%choice_no%' THEN 'No'
ELSE '-'
END
END) AS [SDR3],
...
FROM answers a
WHERE a.reportid = vr.id
) ans
ORDER BY [Finalized date];
In any way you should have indexes on answers(reportid)
and on actionitems(siteid)
to be able to join the answers and action types quickly.
In order to access the YES/NO/- directly, I suggest you add a computed column to your answers table. That will simplyfy above query.
ALTER TABLE answers ADD yes_or_no AS (CASE WHEN data LIKE '%choice_yes%' THEN 'Yes' WHEN data LIKE '%choice_no%' THEN 'No' ELSE '-' END) PERSISTED;
CodePudding user response:
As you stated, a Countries
region lookup table makes the code cleaner. Besides that, I would look for opportunities to reduce data volume from Answers
and ActionItemUpdates
if the table size is significant. After that, aggregate Answers
to reportId
level; ActionItems
and ActionItemUpdates
to siteId
level. Since visitType='ongoing'
, it seems that it's 1-to-1 between the two results. The last step is to join them together to get a site-level report.
with cte_visitReports as (
select id,
siteId,
finalized,
startDate,
endDate,
visitMode,
visitId,
onSiteVisitDates,
remoteVisitDates
from VisitReports
where endDate > '2022-01-01'
and endDate < '2022-11-11'
and questionnaireVersion >= 7
and visitType = 'ongoing'
and finalized IS NOT NULL),
cte_answers as (
select a.reportId,
max(case
when a.questionId='R3' and a.data like '%choice_yes%' then 'Yes'
when a.questionId='R3' and a.data like '%choice_no%' then 'No'
else '-'
end) as R2,
max(case
when a.questionId='SDR3' and a.data like '%choice_yes%' then 'Yes'
when a.questionId='SDR3' and a.data like '%choice_no%' then 'No'
else '-'
end) as SDR5,
max(case
when a.questionId='SDR3.3' and a.data like '%choice_yes%' then 'Yes'
when a.questionId='SDR3.3' and a.data like '%choice_no%' then 'No'
else '-'
end) as SDR5_3,
max(case
when a.questionId='SDR10' and a.data like '%choice_yes%' then 'Yes'
when a.questionId='SDR10' and a.data like '%choice_no%' then 'No'
else '-'
end) as SDR6,
max(case
when a.questionId='SDR4' and a.data like '%choice_yes%' then 'Yes'
when a.questionId='SDR4' and a.data like '%choice_no%' then 'No'
when a.questionId='SDR4' and a.data like '%choice_n_a%' then 'N/A'
else '-'
end) as SDR7
from Answers a
join cte_visitReports r
on a.reportId = r.id
where a.questionId in ('R3','SDR3','SDR3.3','SDR10','SDR4')
-- additional filters such as createdAt if available to reduce data volume
group by a.reportId),
cte_action_items as (
select ai.siteId,
count(distinct case when ai.deletedAt is null and ai.createdAt > r.startDate and ai.createdAt < r.finalized then ai.id end) as created_action_items,
count(distinct case when ai.deletedAt is null and ai.closed > r.startDate and ai.closed < r.finalized then ai.id end) as closed_action_items,
count(distinct case when iu.deletedAt is null and iu.createdAt > r.startDate and iu.createdAt < r.finalized then ai.id end) as updated_action_items
from ActionItems ai
join cte_visitReports r
on ai.siteId = r.siteId
left
join ActionItemUpdates iu
on ai.id = iu.actionItemId
where 1 = 1
-- additional filters if possible ro reduce data volume
group by ai.siteId)
select coalesce(c.region,'?') as 'Region',
s.country as 'Country',
p.number as 'Protocol #',
s.number as 'Site #',
r.finalized as 'Finalized date',
r.startDate as 'Start date',
r.endDate as 'End Date',
r.visitMode as 'Mode',
len(replace(r.remoteVisitDates, ',', '')) / 13 as '# of remote dates',
r.onSiteVisitDates as 'On site dates',
r.remoteVisitDates as 'Remote dates',
r.visitId as 'Visit ID',
coalesce(a.R2,'-') as 'R2',
coalesce(a.SDR5,'-') as 'SDR5',
coalesce(a.SDR5_3,'-') as 'SDR5_3',
coalesce(a.SDR6,'-') as 'SDR6',
coalesce(a.SDR7,'-') as 'SDR7',
coalesce(ai.created_action_items,0) as '# created ActionItems',
coalesce(ai.updated_action_items,0) as '# updated ActionItems',
coalesce(ai.closed_action_items,0) as '# closed ActionItems'
from Sites s
left
join Countries c
on s.country = c.country
join Protocols p
on s.protocolId = p.id
join cte_visitReports r
on s.id = r.siteId
left
join cte_answers a
on r.id = a.reportId
left
join cte_action_items ai
on s.id = ai.siteId
order by r.finalized;
If the report response time is critical, I would try the following approaches:
- Tune the queries
- Cache the result (ie. materialized views)
- Create a datamart for reports and dashboards