Skip to content

Commit da970ce

Browse files
authored
[db] Introduce a join table between SourceComponent and File (#4758)
Queries by source component are slow because they contain a WHERE clause that performs string pattern matching on file paths. As a solution, a SourceComponentFile join table is introduced that stores which files match a given source component.
1 parent 0ddccf1 commit da970ce

File tree

5 files changed

+207
-81
lines changed

5 files changed

+207
-81
lines changed

web/server/codechecker_server/api/mass_store_run.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import base64
1515
from collections import defaultdict
1616
from datetime import datetime, timedelta
17+
import fnmatch
1718
from hashlib import sha256
1819
import json
1920
import os
@@ -51,7 +52,8 @@
5152
ExtendedReportData, \
5253
File, FileContent, \
5354
Report as DBReport, ReportAnnotations, ReviewStatus as ReviewStatusRule, \
54-
Run, RunLock as DBRunLock, RunHistory
55+
Run, RunLock as DBRunLock, RunHistory, \
56+
SourceComponent, SourceComponentFile
5557
from ..metadata import checker_is_unavailable, MetadataInfoParser
5658

5759
from .report_annotations import report_annotation_types
@@ -271,6 +273,54 @@ def get_file_content(file_path: str) -> bytes:
271273
return f.read()
272274

273275

276+
def assign_file_to_source_components(
    session: DBSession,
    file_id: int,
    filepath: str
):
    """
    Link one file to every source component whose patterns accept its path.

    All SourceComponent rows are loaded and their include / skip patterns are
    matched against `filepath` with fnmatch. A component accepts the file if
      - it defines at least one pattern,
      - the path matches one of its include patterns (a component without
        include patterns includes everything), and
      - the path matches none of its skip patterns.

    For every accepting component one SourceComponentFile row is inserted in
    a single bulk insert. Nothing is inserted if no component accepts the
    file. The session is not committed by this function.
    """
    all_components = session.query(SourceComponent).all()

    # Function-level import kept from the original code.
    # NOTE(review): presumably this avoids a circular import between this
    # module and report_server -- confirm.
    from .report_server import get_component_values

    def matches_any(patterns) -> bool:
        """ True if `filepath` matches at least one of the patterns. """
        return any(fnmatch.fnmatch(filepath, glob) for glob in patterns)

    links = []
    for comp in all_components:
        include, skip = get_component_values(comp)

        # If no patterns are defined, the component matches nothing.
        if not (include or skip):
            continue

        # If only skip is defined, it matches everything except skips.
        included = matches_any(include) if include else True
        skipped = matches_any(skip)

        if included and not skipped:
            links.append({
                'source_component_name': comp.name,
                'file_id': file_id
            })

    if links:
        session.bulk_insert_mappings(SourceComponentFile, links)
322+
323+
274324
def add_file_record(
275325
session: DBSession,
276326
file_path: str,
@@ -319,11 +369,14 @@ def add_file_record(
319369
content_hash=content_hash).on_conflict_do_nothing(
320370
index_elements=['filepath', 'content_hash'])
321371
file_id = session.execute(insert_stmt).inserted_primary_key[0]
372+
assign_file_to_source_components(session, file_id, file_path)
322373
session.commit()
323374
return file_id
324375

325376
file_record = File(file_path, content_hash, None, None)
326377
session.add(file_record)
378+
session.flush()
379+
assign_file_to_source_components(session, file_record.id, file_path)
327380
session.commit()
328381
except sqlalchemy.exc.IntegrityError as ex:
329382
LOG.error(ex)

web/server/codechecker_server/api/report_server.py

Lines changed: 72 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
File, FileContent, \
7272
Report, ReportAnnotations, ReportAnalysisInfo, ReviewStatus, \
7373
Run, RunHistory, RunHistoryAnalysisInfo, RunLock, \
74-
SourceComponent
74+
SourceComponent, SourceComponentFile
7575

7676
from .common import exc_to_thrift_reqfail
7777
from .thrift_enum_helper import detection_status_enum, \
@@ -148,41 +148,79 @@ def slugify(text):
148148

149149

150150
def get_component_values(
    component: SourceComponent
) -> Tuple[List[str], List[str]]:
    """
    Split the value of a source component into include and skip path lists.

    The value of the component is decoded as UTF-8 and processed line by
    line. Surrounding whitespace is stripped and empty lines are ignored. A
    line starting with '+' is an include path, a line starting with '-' is a
    skip path; the sign is removed from the stored path. Lines starting with
    any other character are ignored.

    Returns a tuple where the first item contains the list of paths that
    should be included and the second item contains the list of paths that
    should be skipped.
    E.g.:
      +/a/b/x.cpp
      +/a/b/y.cpp
      -/a/b
    On the above component value this function will return the following:
      (['/a/b/x.cpp', '/a/b/y.cpp'], ['/a/b'])
    """
    include: List[str] = []
    skip: List[str] = []

    # The leading sign of a line selects the list the path is appended to.
    targets = {'+': include, '-': skip}

    for raw_line in component.value.decode('utf-8').split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        target = targets.get(line[0])
        if target is not None:
            target.append(line[1:])

    return include, skip
180+
181+
182+
def update_source_component_files(
    session: DBSession,
    component: Optional[SourceComponent] = None
):
    """
    Refreshes the SourceComponentFile table for a specific source component.
    If `component` is None, then all source components are updated.

    The existing associations of the affected components are deleted, then
    they are re-calculated from the include / skip patterns of each component
    by querying the File table:
      - include and skip patterns: included files except the skipped ones,
      - only include patterns: the included files,
      - only skip patterns: every file except the skipped ones,
      - no patterns: the component is associated with no file.

    The session is not committed by this function.
    """
    # Load the affected components into a list exactly once. The collection
    # is needed twice (for the DELETE and for the re-calculation loop), so a
    # lazy Query object would be executed twice.
    if component is None:
        components = session.query(SourceComponent).all()
    else:
        components = [component]

    if not components:
        # Nothing to delete and nothing to insert.
        return

    # in_() is given a real list instead of a one-shot iterator.
    component_names = [comp.name for comp in components]

    # 1. Delete existing associations of the affected components
    session.query(SourceComponentFile) \
        .filter(SourceComponentFile.source_component_name.in_(
            component_names)) \
        .delete(synchronize_session=False)

    for comp in components:
        # 2. Re-calculate associations
        include, skip = get_component_values(comp)

        # A component without any pattern matches no file.
        if not include and not skip:
            continue

        # A missing pattern list is an empty list here, so this single call
        # passes the same arguments as a call with an explicit [].
        include_q, skip_q = get_include_skip_queries(include, skip)

        if include and skip:
            file_ids_query = include_q.except_(skip_q)
        elif include:
            file_ids_query = include_q
        else:
            # Only skip patterns: everything except the skipped files.
            file_ids_query = select(File.id).where(File.id.notin_(skip_q))

        file_ids = session.execute(file_ids_query).fetchall()

        if file_ids:
            session.bulk_insert_mappings(
                SourceComponentFile,
                [{'source_component_name': comp.name,
                  'file_id': fid[0]} for fid in file_ids])
186224

187225

188226
def process_report_filter(
@@ -508,18 +546,10 @@ def get_source_component_file_query(
508546
component_name: str
509547
):
510548
""" Get filter query for a single source component. """
511-
skip, include = get_component_values(session, component_name)
512-
513-
if skip and include:
514-
include_q, skip_q = get_include_skip_queries(include, skip)
515-
return File.id.in_(include_q.except_(skip_q))
516-
517-
if include:
518-
return or_(*[File.filepath.like(conv(fp)) for fp in include])
519-
elif skip:
520-
return and_(*[not_(File.filepath.like(conv(fp))) for fp in skip])
521-
522-
return None
549+
return File.id.in_(
550+
session.query(SourceComponentFile.file_id)
551+
.filter(SourceComponentFile.source_component_name == component_name)
552+
)
523553

524554

525555
def get_reports_by_bugpath_filter_for_single_origin(
@@ -637,28 +667,12 @@ def get_other_source_component_file_query(session):
637667
(Files NOT IN Component_1) AND (Files NOT IN Component_2) ... AND
638668
(Files NOT IN Component_N)
639669
"""
640-
component_names = session.query(SourceComponent.name).all()
641-
642-
# If there are no user defined source components we don't have to filter.
643-
if not component_names:
670+
# Check if there are any source components
671+
if not session.query(SourceComponent).count():
644672
return None
645673

646-
def get_query(component_name: str):
647-
""" Get file filter query for auto generated Other component. """
648-
skip, include = get_component_values(session, component_name)
649-
650-
if skip and include:
651-
include_q, skip_q = get_include_skip_queries(include, skip)
652-
return File.id.notin_(include_q.except_(skip_q))
653-
elif include:
654-
return and_(*[File.filepath.notlike(conv(fp)) for fp in include])
655-
elif skip:
656-
return or_(*[File.filepath.like(conv(fp)) for fp in skip])
657-
658-
return None
659-
660-
queries = [get_query(n) for (n, ) in component_names]
661-
return and_(*queries)
674+
files_in_components = session.query(SourceComponentFile.file_id)
675+
return File.id.notin_(files_in_components)
662676

663677

664678
def get_open_reports_date_filter_query(tbl=Report, date=RunHistory.time):
@@ -3899,6 +3913,8 @@ def addSourceComponent(self, name, value, description):
38993913
user)
39003914

39013915
session.add(component)
3916+
update_source_component_files(session, component)
3917+
39023918
session.commit()
39033919

39043920
return True

web/server/codechecker_server/database/run_db_model.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,23 @@ def __init__(self, name, due_date=None, description=None, closed_at=None):
585585
self.closed_at = closed_at
586586

587587

588+
class SourceComponentFile(Base):
    """
    Join table between source components and files.

    A row states that the file identified by `file_id` belongs to the source
    component named `source_component_name`. The two columns together form
    the primary key, and both foreign keys are declared with
    ON DELETE CASCADE.
    """
    __tablename__ = 'source_component_files'

    # Name of the source component (references source_components.name).
    source_component_name = Column(
        String,
        ForeignKey('source_components.name', ondelete='CASCADE'),
        primary_key=True)

    # ID of the file (references files.id).
    file_id = Column(
        Integer,
        ForeignKey('files.id', ondelete='CASCADE'),
        primary_key=True)

    def __init__(self, source_component_name, file_id):
        """ Create an association between a source component and a file. """
        self.source_component_name = source_component_name
        self.file_id = file_id
603+
604+
588605
class CleanupPlanReportHash(Base):
589606
__tablename__ = 'cleanup_plan_report_hashes'
590607

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Add SourceComponentFile join table
3+
4+
Revision ID: 198654dac219
5+
Revises: c3dad71f8e6b
6+
Create Date: 2026-01-14 15:43:27.225574
7+
"""
8+
9+
from logging import getLogger
10+
11+
from alembic import op
12+
import sqlalchemy as sa
13+
from sqlalchemy.orm import Session
14+
15+
from codechecker_server.api.report_server import \
16+
update_source_component_files
17+
18+
19+
# Revision identifiers, used by Alembic.
20+
revision = '198654dac219'
21+
down_revision = 'c3dad71f8e6b'
22+
branch_labels = None
23+
depends_on = None
24+
25+
26+
def upgrade():
    """
    Create the `source_component_files` join table and fill it with the
    associations of the already existing source components.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'source_component_files',
        sa.Column('source_component_name', sa.String(), nullable=False),
        sa.Column('file_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(
            ['file_id'],
            ['files.id'],
            name=op.f('fk_source_component_files_file_id_files'),
            ondelete='CASCADE'),
        sa.ForeignKeyConstraint(
            ['source_component_name'],
            ['source_components.name'],
            name=op.f(
                'fk_source_component_files_source_component_name_'
                'source_components'),
            ondelete='CASCADE'),
        sa.PrimaryKeyConstraint(
            'source_component_name',
            'file_id',
            name=op.f('pk_source_component_files'))
    )

    # Populate the new table for every source component, using an ORM
    # session bound to the connection of the migration.
    # NOTE(review): this step runs the current application code and models;
    # if those change in a later revision, this migration may behave
    # differently on an old database -- confirm this is acceptable.
    session = Session(bind=op.get_bind())

    update_source_component_files(session)

    # ### end Alembic commands ###
58+
59+
60+
def downgrade():
    """
    Drop the `source_component_files` join table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('source_component_files')
    # ### end Alembic commands ###

web/tests/functional/report_viewer_api/test_report_counting.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -283,30 +283,6 @@ def test_run1_all_file(self):
283283
self.assertEqual(len(res), len(self.run1_files))
284284
self.assertDictEqual(res, self.run1_files)
285285

286-
def test_filter_by_file_and_source_component(self):
287-
"""
288-
File and source component filter in getFileCounts().
289-
290-
Earlier this function resulted an SQL error due to an invalid SQL
291-
statement (File table was ambiguously used, because it was joined
292-
multiple times).
293-
294-
On the other hand we test here that getFileCounts() returns the number
295-
of reports in all files regardless the filter fields. The reason is
296-
that it wouldn't be possible on the GUI to display the options of the
297-
file path filter which doesn't contain a report (i.e. their endpoint is
298-
not in that file). In the future it would be enough to ignore the
299-
filter only if "anywhere on bugpath" option is used (TODO?).
300-
"""
301-
runid = self._runids[0]
302-
run_filter = ReportFilter(
303-
filepath="call*",
304-
componentNames=["doesn't exist"])
305-
file_counts = self._cc_client.getFileCounts(
306-
[runid], run_filter, None, None, 0)
307-
308-
self.assertEqual(len(file_counts), len(self.run1_files))
309-
310286
def test_run2_all_file(self):
311287
"""
312288
Get all the file counts for run2.

0 commit comments

Comments
 (0)