Skip to content

Commit 8860f76

Browse files
author
Timothy Jennison
committed
Add support for rollup instances level hints
This allows survey versions to be rolled up to questions (and topics). Currently there is only support for a single hierarchy but that limitation exists elsewhere already. This will require re-indexing once the config has been updated.
1 parent 02dce51 commit 8860f76

File tree

12 files changed

+199
-40
lines changed

12 files changed

+199
-40
lines changed

docs/generated/UNDERLAY_CONFIG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,11 @@ Names of attributes that we want to calculate instance-level hints for.
860860

861861
Instance-level hints are ranges of possible values for a particular criteria instance. They are used to support criteria-specific modifiers (e.g. range of values for measurement code "glucose test").
862862

863+
### SZOccurrenceEntity.attributesWithRollupInstanceLevelHints
864+
**required** Set [ String ]
865+
866+
Names of attributes that we want to calculate instance-level hints for which values should be rolled up and included in their ancestors hints as well.
867+
863868
### SZOccurrenceEntity.criteriaRelationship
864869
**required** [SZCriteriaRelationship](#szcriteriarelationship)
865870

indexer/src/main/java/bio/terra/tanagra/indexing/JobSequencer.java

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import bio.terra.tanagra.underlay.Underlay;
2222
import bio.terra.tanagra.underlay.entitymodel.Attribute;
2323
import bio.terra.tanagra.underlay.entitymodel.Entity;
24+
import bio.terra.tanagra.underlay.entitymodel.Hierarchy;
2425
import bio.terra.tanagra.underlay.entitymodel.Relationship;
2526
import bio.terra.tanagra.underlay.entitymodel.entitygroup.CriteriaOccurrence;
2627
import bio.terra.tanagra.underlay.entitymodel.entitygroup.EntityGroup;
@@ -421,6 +422,12 @@ public static SequencedJobSet getJobSetForCriteriaOccurrence(
421422
// TODO: Handle >1 occurrence entity.
422423
Entity occurrenceEntity = criteriaOccurrence.getOccurrenceEntities().get(0);
423424
if (criteriaOccurrence.hasInstanceLevelDisplayHints(occurrenceEntity)) {
425+
// TODO: Handle >1 hierarchy.
426+
Hierarchy hierarchy =
427+
criteriaOccurrence.getCriteriaEntity().hasHierarchies()
428+
? criteriaOccurrence.getCriteriaEntity().getHierarchies().get(0)
429+
: null;
430+
424431
Relationship occurrenceCriteriaRelationship =
425432
criteriaOccurrence.getOccurrenceCriteriaRelationship(occurrenceEntity.getName());
426433
Relationship occurrencePrimaryRelationship =
@@ -458,7 +465,14 @@ public static SequencedJobSet getJobSetForCriteriaOccurrence(
458465
.getInstanceLevelDisplayHints(
459466
criteriaOccurrence.getName(),
460467
occurrenceEntity.getName(),
461-
criteriaOccurrence.getCriteriaEntity().getName())));
468+
criteriaOccurrence.getCriteriaEntity().getName()),
469+
hierarchy,
470+
hierarchy != null
471+
? underlay
472+
.getIndexSchema()
473+
.getHierarchyAncestorDescendant(
474+
criteriaOccurrence.getCriteriaEntity().getName(), hierarchy.getName())
475+
: null));
462476
}
463477

464478
if (criteriaOccurrence.getCriteriaEntity().hasHierarchies()) {

indexer/src/main/java/bio/terra/tanagra/indexing/job/dataflow/WriteInstanceLevelDisplayHints.java

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,30 @@
33
import bio.terra.tanagra.api.shared.DataType;
44
import bio.terra.tanagra.indexing.job.BigQueryJob;
55
import bio.terra.tanagra.indexing.job.dataflow.beam.BigQueryBeamUtils;
6+
import bio.terra.tanagra.indexing.job.dataflow.beam.CountUtils;
67
import bio.terra.tanagra.indexing.job.dataflow.beam.DataflowUtils;
78
import bio.terra.tanagra.query.sql.SqlField;
89
import bio.terra.tanagra.query.sql.SqlQueryField;
910
import bio.terra.tanagra.underlay.entitymodel.Attribute;
1011
import bio.terra.tanagra.underlay.entitymodel.Entity;
12+
import bio.terra.tanagra.underlay.entitymodel.Hierarchy;
1113
import bio.terra.tanagra.underlay.entitymodel.Relationship;
1214
import bio.terra.tanagra.underlay.entitymodel.entitygroup.CriteriaOccurrence;
1315
import bio.terra.tanagra.underlay.indextable.ITEntityMain;
16+
import bio.terra.tanagra.underlay.indextable.ITHierarchyAncestorDescendant;
1417
import bio.terra.tanagra.underlay.indextable.ITInstanceLevelDisplayHints;
1518
import bio.terra.tanagra.underlay.indextable.ITRelationshipIdPairs;
1619
import bio.terra.tanagra.underlay.serialization.SZIndexer;
1720
import com.google.api.services.bigquery.model.TableRow;
1821
import jakarta.annotation.Nullable;
1922
import java.io.Serializable;
23+
import java.util.stream.StreamSupport;
2024
import org.apache.beam.sdk.Pipeline;
2125
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
2226
import org.apache.beam.sdk.transforms.Count;
2327
import org.apache.beam.sdk.transforms.Distinct;
2428
import org.apache.beam.sdk.transforms.Filter;
29+
import org.apache.beam.sdk.transforms.FlatMapElements;
2530
import org.apache.beam.sdk.transforms.MapElements;
2631
import org.apache.beam.sdk.transforms.Max;
2732
import org.apache.beam.sdk.transforms.Min;
@@ -48,6 +53,8 @@ public class WriteInstanceLevelDisplayHints extends BigQueryJob {
4853
private final @Nullable ITRelationshipIdPairs occurrenceCriteriaRelationshipIdPairsTable;
4954
private final @Nullable ITRelationshipIdPairs occurrencePrimaryRelationshipIdPairsTable;
5055
private final ITInstanceLevelDisplayHints indexTable;
56+
private final @Nullable Hierarchy hierarchy;
57+
private final @Nullable ITHierarchyAncestorDescendant ancestorDescendantTable;
5158

5259
@SuppressWarnings("checkstyle:ParameterNumber")
5360
public WriteInstanceLevelDisplayHints(
@@ -59,7 +66,9 @@ public WriteInstanceLevelDisplayHints(
5966
ITEntityMain primaryEntityIndexTable,
6067
@Nullable ITRelationshipIdPairs occurrenceCriteriaRelationshipIdPairsTable,
6168
@Nullable ITRelationshipIdPairs occurrencePrimaryRelationshipIdPairsTable,
62-
ITInstanceLevelDisplayHints indexTable) {
69+
ITInstanceLevelDisplayHints indexTable,
70+
@Nullable Hierarchy hierarchy,
71+
@Nullable ITHierarchyAncestorDescendant ancestorDescendantTable) {
6372
super(indexerConfig);
6473
this.criteriaOccurrence = criteriaOccurrence;
6574
this.occurrenceEntity = occurrenceEntity;
@@ -69,6 +78,8 @@ public WriteInstanceLevelDisplayHints(
6978
this.occurrenceCriteriaRelationshipIdPairsTable = occurrenceCriteriaRelationshipIdPairsTable;
7079
this.occurrencePrimaryRelationshipIdPairsTable = occurrencePrimaryRelationshipIdPairsTable;
7180
this.indexTable = indexTable;
81+
this.hierarchy = hierarchy;
82+
this.ancestorDescendantTable = ancestorDescendantTable;
7283
}
7384

7485
@Override
@@ -119,8 +130,8 @@ public void run(boolean isDryRun) {
119130
readInRelationshipIdPairs(
120131
pipeline, occCriIdPairsSql, entityAIdColumnName, entityBIdColumnName);
121132

122-
// Build a query to select all occurrence-criteria id pairs, and the pipeline steps to read the
123-
// results and build a (occurrence id, criteria id) KV PCollection.
133+
// Build a query to select all occurrence-primary id pairs, and the pipeline steps to read the
134+
// results and build a (occurrence id, primary id) KV PCollection.
124135
String occPriIdPairsSql =
125136
getQueryRelationshipIdPairs(
126137
entityAIdColumnName,
@@ -134,17 +145,32 @@ public void run(boolean isDryRun) {
134145
readInRelationshipIdPairs(
135146
pipeline, occPriIdPairsSql, entityAIdColumnName, entityBIdColumnName);
136147

148+
PCollection<KV<Long, Long>> rollupOccCriIdPairKVs = null;
149+
if (hierarchy != null
150+
&& criteriaOccurrence.hasRollupInstanceLevelDisplayHints(occurrenceEntity)) {
151+
PCollection<KV<Long, Long>> descendantAncestorRelationshipsPC =
152+
BigQueryBeamUtils.readDescendantAncestorRelationshipsFromBQ(
153+
pipeline, ancestorDescendantTable);
154+
155+
// Expand the set of occurrences to include a repeat for each ancestor.
156+
rollupOccCriIdPairKVs =
157+
CountUtils.repeatOccurrencesForHints(occCriIdPairKVs, descendantAncestorRelationshipsPC);
158+
}
159+
final PCollection<KV<Long, Long>> finalRollupOccCriIdPairKVs = rollupOccCriIdPairKVs;
160+
137161
criteriaOccurrence
138162
.getAttributesWithInstanceLevelDisplayHints(occurrenceEntity)
139163
.forEach(
140-
attribute -> {
164+
(attribute, rollup) -> {
165+
PCollection<KV<Long, Long>> idPairsKVs =
166+
rollup ? finalRollupOccCriIdPairKVs : occCriIdPairKVs;
141167
if (attribute.isValueDisplay()) {
142168
LOGGER.info("enum val hint: {}", attribute.getName());
143-
enumValHint(occCriIdPairKVs, occPriIdPairKVs, occIdRowKVs, attribute);
169+
enumValHint(idPairsKVs, occPriIdPairKVs, occIdRowKVs, attribute);
144170
} else if (DataType.INT64.equals(attribute.getDataType())
145171
|| DataType.DOUBLE.equals(attribute.getDataType())) {
146172
LOGGER.info("numeric range hint: {}", attribute.getName());
147-
numericRangeHint(occCriIdPairKVs, occIdRowKVs, attribute);
173+
numericRangeHint(idPairsKVs, occIdRowKVs, attribute);
148174
} // TODO: Calculate display hints for other data types.
149175
});
150176

@@ -287,13 +313,15 @@ private void numericRangeHint(
287313
occIdAndNumValCriId
288314
.apply(Filter.by(cogb -> cogb.getValue().getAll(numValTag).iterator().hasNext()))
289315
.apply(
290-
MapElements.into(
316+
FlatMapElements.into(
291317
TypeDescriptors.kvs(TypeDescriptors.longs(), TypeDescriptors.doubles()))
292318
.via(
293-
cogb ->
294-
KV.of(
295-
cogb.getValue().getOnly(criIdTag),
296-
cogb.getValue().getOnly(numValTag))));
319+
cogb -> {
320+
Iterable<Long> criIds = cogb.getValue().getAll(criIdTag);
321+
return StreamSupport.stream(criIds.spliterator(), false)
322+
.map((Long criId) -> KV.of(criId, cogb.getValue().getOnly(numValTag)))
323+
.toList();
324+
}));
297325

298326
// Compute numeric range for each criteriaId.
299327
PCollection<IdNumericRange> numericRanges = numericRangeHint(criteriaValuePairs);
@@ -361,23 +389,28 @@ private void enumValHint(
361389
.and(criIdTag, occCriIdPairs)
362390
.and(priIdTag, occPriIdPairs)
363391
.apply(CoGroupByKey.create());
392+
364393
PCollection<KV<IdEnumValue, Long>> criteriaEnumPrimaryPairs =
365394
occIdAndAttrsCriIdPriId
366395
.apply(Filter.by(cogb -> cogb.getValue().getAll(occAttrsTag).iterator().hasNext()))
367396
.apply(
368-
MapElements.into(
397+
FlatMapElements.into(
369398
TypeDescriptors.kvs(
370399
new TypeDescriptor<IdEnumValue>() {}, TypeDescriptors.longs()))
371400
.via(
372401
cogb -> {
373-
Long criId = cogb.getValue().getOnly(criIdTag);
402+
Iterable<Long> criIds = cogb.getValue().getAll(criIdTag);
374403
Long priId = cogb.getValue().getOnly(priIdTag);
375404

376405
TableRow occAttrs = cogb.getValue().getOnly(occAttrsTag);
377406
String enumValue = (String) occAttrs.get(enumValColName);
378407
String enumDisplay = (String) occAttrs.get(enumDisplayColName);
379408

380-
return KV.of(new IdEnumValue(criId, enumValue, enumDisplay), priId);
409+
return StreamSupport.stream(criIds.spliterator(), false)
410+
.map(
411+
(Long criId) ->
412+
KV.of(new IdEnumValue(criId, enumValue, enumDisplay), priId))
413+
.toList();
381414
}));
382415

383416
// Compute enum values and counts for each criteriaId.

indexer/src/main/java/bio/terra/tanagra/indexing/job/dataflow/WriteRollupCounts.java

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -263,21 +263,13 @@ private void writeFieldsToTempTable(boolean isDryRun) {
263263

264264
// Optionally handle a hierarchy for the rollup entity.
265265
if (hierarchy != null) {
266-
// Build a query to select all ancestor-descendant pairs from the ancestor-descendant table,
267-
// and the pipeline step to read the results.
268-
String ancestorDescendantSql =
269-
"SELECT * FROM " + ancestorDescendantTable.getTablePointer().render();
270-
LOGGER.info("ancestor-descendant query: {}", ancestorDescendantSql);
271-
PCollection<KV<Long, Long>> ancestorDescendantRelationshipsPC =
272-
BigQueryBeamUtils.readTwoFieldRowsFromBQ(
273-
pipeline,
274-
ancestorDescendantSql,
275-
ITHierarchyAncestorDescendant.Column.DESCENDANT.getSchema().getColumnName(),
276-
ITHierarchyAncestorDescendant.Column.ANCESTOR.getSchema().getColumnName());
266+
PCollection<KV<Long, Long>> descendantAncestorRelationshipsPC =
267+
BigQueryBeamUtils.readDescendantAncestorRelationshipsFromBQ(
268+
pipeline, ancestorDescendantTable);
277269

278270
// Expand the set of occurrences to include a repeat for each ancestor.
279271
idPairsPC =
280-
CountUtils.repeatOccurrencesForHierarchy(idPairsPC, ancestorDescendantRelationshipsPC);
272+
CountUtils.repeatOccurrencesForHierarchy(idPairsPC, descendantAncestorRelationshipsPC);
281273
}
282274

283275
// Count the number of distinct occurrences per entity id.

indexer/src/main/java/bio/terra/tanagra/indexing/job/dataflow/beam/BigQueryBeamUtils.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import bio.terra.tanagra.api.shared.DataType;
44
import bio.terra.tanagra.exception.SystemException;
55
import bio.terra.tanagra.underlay.ColumnSchema;
6+
import bio.terra.tanagra.underlay.indextable.ITHierarchyAncestorDescendant;
67
import com.google.api.services.bigquery.model.TableFieldSchema;
78
import com.google.api.services.bigquery.model.TableRow;
89
import com.google.api.services.bigquery.model.TableSchema;
@@ -20,8 +21,11 @@
2021
import org.apache.beam.sdk.values.PCollection;
2122
import org.apache.beam.sdk.values.TypeDescriptors;
2223
import org.apache.commons.text.StringSubstitutor;
24+
import org.slf4j.Logger;
25+
import org.slf4j.LoggerFactory;
2326

2427
public final class BigQueryBeamUtils {
28+
private static final Logger LOGGER = LoggerFactory.getLogger(BigQueryBeamUtils.class);
2529

2630
private BigQueryBeamUtils() {}
2731

@@ -65,6 +69,22 @@ public static PCollection<KV<Long, Long>> readTwoFieldRowsFromBQ(
6569
}));
6670
}
6771

72+
/**
73+
* Build a query to select all descendant-ancestor pairs from the ancestor-descendant table, and
74+
* the pipeline step to read the results.
75+
*/
76+
public static PCollection<KV<Long, Long>> readDescendantAncestorRelationshipsFromBQ(
77+
Pipeline pipeline, ITHierarchyAncestorDescendant ancestorDescendantTable) {
78+
String descendantAncestorSql =
79+
"SELECT * FROM " + ancestorDescendantTable.getTablePointer().render();
80+
LOGGER.info("descendant-ancestor query: {}", descendantAncestorSql);
81+
return BigQueryBeamUtils.readTwoFieldRowsFromBQ(
82+
pipeline,
83+
descendantAncestorSql,
84+
ITHierarchyAncestorDescendant.Column.DESCENDANT.getSchema().getColumnName(),
85+
ITHierarchyAncestorDescendant.Column.ANCESTOR.getSchema().getColumnName());
86+
}
87+
6888
public static String getTableSqlPath(String projectId, String datasetId, String tableName) {
6989
final String template = "${projectId}:${datasetId}.${tableName}";
7090
Map<String, String> params =

indexer/src/main/java/bio/terra/tanagra/indexing/job/dataflow/beam/CountUtils.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import org.apache.beam.sdk.transforms.Count;
55
import org.apache.beam.sdk.transforms.Distinct;
66
import org.apache.beam.sdk.transforms.Flatten;
7+
import org.apache.beam.sdk.transforms.KvSwap;
78
import org.apache.beam.sdk.transforms.MapElements;
89
import org.apache.beam.sdk.transforms.SimpleFunction;
910
import org.apache.beam.sdk.transforms.Values;
@@ -136,4 +137,53 @@ public static PCollection<KV<Long, Long>> repeatOccurrencesForHierarchy(
136137
.and(ancestorOccurrences)
137138
.apply(Flatten.pCollections());
138139
}
140+
141+
/**
142+
* For each occurrence (occurrence, criteria), generate a new occurrence for each ancestor of the
143+
* criteria node (occurrence, ancestor).
144+
*
145+
* <p>This is the same concept as repeatOccurrencesForHierarchy but over occurrence ids.
146+
*
147+
* @param occurrences a collection of all occurrences that we want to count and the criteria
148+
* they're associated with
149+
* @param descendantAncestor a collection of (descendant, ancestor) pairs for the criteria nodes
150+
* that we want a count for. note that this is the expanded set of all transitive
151+
* relationships in the hierarchy, not just the parent/child pairs
152+
* @return an expanded collection of occurrences (occurrence, ancestor), where each occurrence has
153+
* been repeated for each ancestor of its primary node. note for later steps that this will
154+
* contain multiple keys
155+
*/
156+
public static PCollection<KV<Long, Long>> repeatOccurrencesForHints(
157+
PCollection<KV<Long, Long>> occurrences, PCollection<KV<Long, Long>> descendantAncestor) {
158+
// Remove duplicate occurrences.
159+
PCollection<KV<Long, Long>> distinctOccurrences =
160+
occurrences.apply(
161+
"remove duplicate occurrences before repeating for hints", Distinct.create());
162+
163+
// Swap (occurrence, criteria) to (criteria, occurrence). Duplicate keys are allowed at this
164+
// point.
165+
PCollection<KV<Long, Long>> criteriaOccurrences =
166+
distinctOccurrences.apply(
167+
"swap (occurrence, criteria) to (criteria, occurrence)", KvSwap.create());
168+
169+
// JOIN: distinctOccurrences (criteria, occurrence) INNER JOIN descendantAncestor (descendant,
170+
// ancestor)
171+
// ON criteria=descendant
172+
// RESULT: occurrenceToAncestorAndOccurrence (criteria=descendant, (occurrence, ancestor))
173+
PCollection<KV<Long, KV<Long, Long>>> criteriaToOccurrenceAndAncestor =
174+
Join.innerJoin(
175+
"inner join occurrences with ancestors", criteriaOccurrences, descendantAncestor);
176+
177+
// Get rid of the descendant node. That was only needed as the innerJoin field.
178+
// RESULT: (occurrence, ancestor)
179+
PCollection<KV<Long, Long>> occurrenceAncestors =
180+
criteriaToOccurrenceAndAncestor.apply(Values.create());
181+
182+
// The descendant-ancestor pairs don't include a self-reference row (i.e. descendant=ancestor).
183+
// So to get the full set of occurrences, concatenate the original occurrences with the ancestor
184+
// duplicates.
185+
return PCollectionList.of(distinctOccurrences)
186+
.and(occurrenceAncestors)
187+
.apply(Flatten.pCollections());
188+
}
139189
}

ui/src/tanagra-underlay/underlayConfig.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ export type SZMetadata = {
152152

153153
export type SZOccurrenceEntity = {
154154
attributesWithInstanceLevelHints: string[];
155+
attributesWithRollupInstanceLevelHints: string[];
155156
criteriaRelationship: SZCriteriaRelationship;
156157
occurrenceEntity: string;
157158
primaryRelationship: SZPrimaryRelationship;

underlay/src/main/java/bio/terra/tanagra/underlay/ConfigReader.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -321,11 +321,16 @@ private SZCriteriaOccurrence deserializeCriteriaOccurrence(String criteriaOccurr
321321
? new HashSet<>()
322322
: szCriteriaOccurrence.occurrenceEntities;
323323
szCriteriaOccurrence.occurrenceEntities.forEach(
324-
szOccurrenceEntity ->
325-
szOccurrenceEntity.attributesWithInstanceLevelHints =
326-
szOccurrenceEntity.attributesWithInstanceLevelHints == null
327-
? new HashSet<>()
328-
: szOccurrenceEntity.attributesWithInstanceLevelHints);
324+
szOccurrenceEntity -> {
325+
szOccurrenceEntity.attributesWithInstanceLevelHints =
326+
szOccurrenceEntity.attributesWithInstanceLevelHints == null
327+
? new HashSet<>()
328+
: szOccurrenceEntity.attributesWithInstanceLevelHints;
329+
szOccurrenceEntity.attributesWithRollupInstanceLevelHints =
330+
szOccurrenceEntity.attributesWithRollupInstanceLevelHints == null
331+
? new HashSet<>()
332+
: szOccurrenceEntity.attributesWithRollupInstanceLevelHints;
333+
});
329334

330335
return szCriteriaOccurrence;
331336
} catch (IOException ioEx) {

0 commit comments

Comments
 (0)