It looks like the query shown below is called for insert_condition_occurrence.sql and was causing the poor performance (2 hours to execute).
select
row_number()over(order by p.person_id) condition_occurrence_id,
p.person_id person_id,
srctostdvm.target_concept_id condition_concept_id,
c.start condition_start_date,
c.start condition_start_datetime,
c.stop condition_end_date,
c.stop condition_end_datetime,
38000175 condition_type_concept_id,
cast(null as varchar) stop_reason,
pr.provider_id provider_id,
fv.visit_occurrence_id_new visit_occurrence_id,
fv.visit_occurrence_id_new + 1000000 visit_detail_id,
c.code condition_source_value,
srctosrcvm.source_concept_id condition_source_concept_id,
null condition_status_source_value,
0 condition_status_concept_id
from
etl_synthea_1k_synthea_native.conditions c
join
etl_synthea_1k.source_to_standard_vocab_map srctostdvm
on srctostdvm.source_code = c.code
and srctostdvm.target_domain_id = 'Condition'
and srctostdvm.target_vocabulary_id = 'SNOMED'
and srctostdvm.source_vocabulary_id = 'SNOMED'
and srctostdvm.target_standard_concept = 'S'
and srctostdvm.target_invalid_reason is null
join
etl_synthea_1k.source_to_source_vocab_map srctosrcvm
on srctosrcvm.source_code = c.code
and srctosrcvm.source_vocabulary_id = 'SNOMED'
and srctosrcvm.source_domain_id = 'Condition'
left join
etl_synthea_1k.final_visit_ids fv
on fv.encounter_id = c.encounter
left join
etl_synthea_1k_synthea_native.encounters e
on c.encounter = e.id
and c.patient = e.patient
left join
etl_synthea_1k.provider pr
on e.provider = pr.provider_source_value
join
etl_synthea_1k.person p
on c.patient = p.person_source_value
Adding the indexes shown below and including all of the indexes created from the CDM script OMOPCDM_postgresql_5.4_indices.sql resolved this issue for me.
I’m currently adding the indexes by hand. To do this I needed to break up LoadEventTables into two methods CreateVocabTables and LoadEventTables. The calls to CreateVocabMapTables and CreateVisitRollupTables that were in LoadEventTables were moved to the new method. This code has been checked into a NACHC-CAD fork at https://github.com/NACHC-CAD/ETL-Synthea.
create index source_to_standard_vocab_map_sctditvisvitsctir on source_to_standard_vocab_map(
source_code,
target_domain_id,
target_vocabulary_id,
source_vocabulary_id,
target_standard_concept,
target_invalid_reason
);
create index source_to_standard_vocab_map_scsvsd on source_to_standard_vocab_map (
source_code,
source_vocabulary_id,
source_domain_id
);
create index source_to_source_vocab_map_scsvisdi on source_to_source_vocab_map (
source_code,
source_vocabulary_id,
source_domain_id
);
create index provider_psv on provider (
provider_source_value
);
create index person_psv on person (
person_source_value
);