Skip to content

Commit 0ec3a49

Browse files
authored
feat: add pluggable persistence backends with storage API (#93)
* ci: add C extension code coverage with lcov

  Add gcov/lcov-based coverage for the native C extension. Coverage is compiled with -O0 --coverage flags and generates HTML reports showing line and function coverage for vector.c, matrix.c, svd.c, and classifier_ext.c.

  New rake tasks:
  - coverage:clean - remove gcov data files
  - coverage:compile - build with coverage instrumentation
  - coverage:report - generate HTML report via lcov
  - coverage:c - full cycle (clean, compile, test, report)

  CI uploads coverage reports as artifacts on each run.

* feat: add pluggable persistence backends with storage API

  Implement ActiveRecord-style storage abstraction allowing classifiers to persist to any backend. Users can configure storage at setup time and use simple save/reload methods without path arguments.

  Key additions:
  - Storage::Base abstract class defining write/read/delete/exists? protocol
  - Storage::Memory for testing and ephemeral use
  - Storage::File for file-based persistence
  - Dirty tracking with dirty? method on Bayes and LSI
  - reload raises UnsavedChangesError, reload! forces discard
  - load(storage:) class method sets storage on returned instance

  Legacy API preserved:
  - save_to_file(path) replaces old save(path)
  - load_from_file(path) replaces old load(path)

  Closes #90

* fix: add super calls to storage class initializers

* style: fix RuboCop offenses in storage tests

* fix: add auto_rebuild to remove_item and fix Rakefile lint
1 parent 298dedb commit 0ec3a49

File tree

15 files changed

+1050
-35
lines changed

15 files changed

+1050
-35
lines changed

.github/workflows/ruby.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,23 @@ jobs:
5252
- name: Type check with Steep
5353
if: matrix.ruby-version != '4.0'
5454
run: bundle exec steep check
55+
56+
coverage:
57+
runs-on: ubuntu-latest
58+
steps:
59+
- uses: actions/checkout@v4
60+
- name: Set up Ruby
61+
uses: ruby/setup-ruby@v1
62+
with:
63+
ruby-version: '3.4'
64+
bundler-cache: true
65+
- name: Install lcov
66+
run: sudo apt-get update && sudo apt-get install -y lcov
67+
- name: Run C coverage
68+
run: bundle exec rake coverage:c
69+
- name: Upload C coverage report
70+
uses: actions/upload-artifact@v4
71+
with:
72+
name: c-coverage-report
73+
path: coverage/c/html/
74+
retention-days: 30

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ pkg/
99
*.bundle
1010
*.so
1111
*.o
12+
*.gcda
13+
*.gcno
1214
tmp/
1315
lib/classifier/classifier_ext.*
1416
Makefile

Rakefile

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,88 @@ task :publish do
6565
`ssh rufy update-classifier-doc`
6666
Rake::RubyForgePublisher.new('classifier', 'cardmagic').upload
6767
end
68+
69+
# C Code Coverage tasks
70+
namespace :coverage do # rubocop:disable Metrics/BlockLength
71+
desc 'Clean C coverage data files'
72+
task :clean do
73+
FileUtils.rm_f(Dir.glob('ext/classifier/**/*.gcda'))
74+
FileUtils.rm_f(Dir.glob('ext/classifier/**/*.gcno'))
75+
FileUtils.rm_f(Dir.glob('tmp/**/classifier/**/*.gcda'))
76+
FileUtils.rm_f(Dir.glob('tmp/**/classifier/**/*.gcno'))
77+
FileUtils.rm_rf('coverage/c')
78+
end
79+
80+
desc 'Compile C extension with coverage instrumentation'
81+
task :compile do
82+
ENV['COVERAGE'] = '1'
83+
Rake::Task['clobber'].invoke if Rake::Task.task_defined?('clobber')
84+
Rake::Task['compile'].reenable
85+
Rake::Task['compile'].invoke
86+
end
87+
88+
desc 'Generate C coverage report using lcov'
89+
task :report do # rubocop:disable Metrics/BlockLength
90+
project_root = File.expand_path(__dir__)
91+
ext_dir = File.join(project_root, 'ext/classifier')
92+
# Find the directory containing .gcda files (build directory varies by platform/Ruby version)
93+
tmp_ext_dir = Dir.glob('tmp/**/classifier_ext/**/*.gcda').first&.then { |f| File.dirname(f) }
94+
coverage_dir = 'coverage/c'
95+
96+
unless tmp_ext_dir
97+
puts 'No coverage data found. Run tests with coverage first.'
98+
next
99+
end
100+
101+
FileUtils.mkdir_p(coverage_dir)
102+
103+
# Run gcov manually in the build directory to generate .gcov files
104+
Dir.chdir(tmp_ext_dir) do
105+
# Find all source files and run gcov on them
106+
gcda_files = Dir.glob('*.gcda')
107+
gcda_files.each do |gcda|
108+
# Source file is in ext/classifier, referenced via relative path in the gcno
109+
sh "gcov -o . #{gcda} 2>/dev/null || true"
110+
end
111+
end
112+
113+
# Capture coverage data with base directory for source resolution
114+
sh "lcov --capture --directory #{tmp_ext_dir} --base-directory #{ext_dir} " \
115+
"--output-file #{coverage_dir}/coverage.info " \
116+
'--ignore-errors inconsistent,gcov,source 2>&1 || true'
117+
118+
if File.exist?("#{coverage_dir}/coverage.info") && File.size("#{coverage_dir}/coverage.info").positive?
119+
# Filter out system headers
120+
sh "lcov --remove #{coverage_dir}/coverage.info '/usr/*' '*/ruby/*' " \
121+
"--output-file #{coverage_dir}/coverage.info --ignore-errors unused 2>/dev/null || true"
122+
123+
# Fix source paths: the gcov relative paths resolve incorrectly
124+
# Substitute wrong paths with correct absolute paths
125+
info_content = File.read("#{coverage_dir}/coverage.info")
126+
info_content.gsub!(%r{SF:.*/ext/classifier/}, "SF:#{ext_dir}/")
127+
File.write("#{coverage_dir}/coverage.info", info_content)
128+
129+
# Generate HTML report
130+
sh "genhtml #{coverage_dir}/coverage.info --output-directory #{coverage_dir}/html " \
131+
"--prefix #{project_root} --ignore-errors unmapped,source 2>&1 || true"
132+
133+
puts "\nC coverage report generated at: #{coverage_dir}/html/index.html"
134+
135+
# Print summary
136+
sh "lcov --summary #{coverage_dir}/coverage.info 2>/dev/null || true"
137+
else
138+
puts 'Coverage data capture failed. Check that tests exercise the C extension.'
139+
end
140+
end
141+
142+
desc 'Run tests and generate C coverage report'
143+
task :run do
144+
Rake::Task['coverage:clean'].invoke
145+
Rake::Task['coverage:compile'].invoke
146+
Rake::Task['test'].invoke
147+
Rake::Task['coverage:report'].invoke
148+
end
149+
end
150+
151+
desc 'Run C code coverage (alias for coverage:run)'
152+
task 'coverage:c' => 'coverage:run'

ext/classifier/extconf.rb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
require 'mkmf'
22

3-
# Optimization flags for performance
4-
$CFLAGS << ' -O3 -ffast-math -Wall' # rubocop:disable Style/GlobalVars
3+
# rubocop:disable Style/GlobalVars
4+
if ENV['COVERAGE']
5+
# Coverage flags: disable optimization for accurate line coverage
6+
$CFLAGS << ' -O0 -g --coverage -Wall'
7+
$LDFLAGS << ' --coverage'
8+
else
9+
# Optimization flags for performance
10+
$CFLAGS << ' -O3 -ffast-math -Wall'
11+
end
12+
# rubocop:enable Style/GlobalVars
513

614
# Create the Makefile
715
create_makefile('classifier/classifier_ext')

lib/classifier.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
# License:: LGPL
2626

2727
require 'rubygems'
28+
require 'classifier/errors'
29+
require 'classifier/storage'
2830
require 'classifier/extensions/string'
2931
require 'classifier/extensions/vector'
3032
require 'classifier/bayes'

lib/classifier/bayes.rb

Lines changed: 95 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ class Bayes
1717
# @rbs @category_word_count: Hash[Symbol, Integer]
1818
# @rbs @cached_training_count: Float?
1919
# @rbs @cached_vocab_size: Integer?
20+
# @rbs @dirty: bool
21+
# @rbs @storage: Storage::Base?
22+
23+
attr_accessor :storage
2024

2125
# The class can be created with one or more categories, each of which will be
2226
# initialized and given a training method. E.g.,
@@ -31,6 +35,8 @@ def initialize(*categories)
3135
@category_word_count = Hash.new(0)
3236
@cached_training_count = nil
3337
@cached_vocab_size = nil
38+
@dirty = false
39+
@storage = nil
3440
end
3541

3642
# Provides a general training method for all categories specified in Bayes#new
@@ -46,6 +52,7 @@ def train(category, text)
4652
word_hash = text.word_hash
4753
synchronize do
4854
invalidate_caches
55+
@dirty = true
4956
@category_counts[category] += 1
5057
word_hash.each do |word, count|
5158
@categories[category][word] ||= 0
@@ -70,6 +77,7 @@ def untrain(category, text)
7077
word_hash = text.word_hash
7178
synchronize do
7279
invalidate_caches
80+
@dirty = true
7381
@category_counts[category] -= 1
7482
word_hash.each do |word, count|
7583
next unless @total_words >= 0
@@ -159,17 +167,81 @@ def self.from_json(json)
159167
instance
160168
end
161169

162-
# Saves the classifier state to a file.
170+
# Saves the classifier to the configured storage.
171+
# Raises ArgumentError if no storage is configured.
172+
#
173+
# @rbs () -> void
174+
def save
175+
raise ArgumentError, 'No storage configured. Use save_to_file(path) or set storage=' unless storage
176+
177+
storage.write(to_json)
178+
@dirty = false
179+
end
180+
181+
# Saves the classifier state to a file (legacy API).
163182
#
164183
# @rbs (String) -> Integer
165-
def save(path)
166-
File.write(path, to_json)
184+
def save_to_file(path)
185+
result = File.write(path, to_json)
186+
@dirty = false
187+
result
188+
end
189+
190+
# Reloads the classifier from the configured storage.
191+
# Raises UnsavedChangesError if there are unsaved changes.
192+
# Use reload! to force reload and discard changes.
193+
#
194+
# @rbs () -> self
195+
def reload
196+
raise ArgumentError, 'No storage configured' unless storage
197+
raise UnsavedChangesError, 'Unsaved changes would be lost. Call save first or use reload!' if @dirty
198+
199+
data = storage.read
200+
raise StorageError, 'No saved state found' unless data
201+
202+
restore_from_json(data)
203+
@dirty = false
204+
self
205+
end
206+
207+
# Force reloads the classifier from storage, discarding any unsaved changes.
208+
#
209+
# @rbs () -> self
210+
def reload!
211+
raise ArgumentError, 'No storage configured' unless storage
212+
213+
data = storage.read
214+
raise StorageError, 'No saved state found' unless data
215+
216+
restore_from_json(data)
217+
@dirty = false
218+
self
219+
end
220+
221+
# Returns true if there are unsaved changes.
222+
#
223+
# @rbs () -> bool
224+
def dirty?
225+
@dirty
167226
end
168227

169-
# Loads a classifier from a file saved with #save.
228+
# Loads a classifier from the configured storage.
229+
# The storage is set on the returned instance.
230+
#
231+
# @rbs (storage: Storage::Base) -> Bayes
232+
def self.load(storage:)
233+
data = storage.read
234+
raise StorageError, 'No saved state found' unless data
235+
236+
instance = from_json(data)
237+
instance.storage = storage
238+
instance
239+
end
240+
241+
# Loads a classifier from a file (legacy API).
170242
#
171243
# @rbs (String) -> Bayes
172-
def self.load(path)
244+
def self.load_from_file(path)
173245
from_json(File.read(path))
174246
end
175247

@@ -219,6 +291,7 @@ def categories
219291
def add_category(category)
220292
synchronize do
221293
invalidate_caches
294+
@dirty = true
222295
@categories[category.prepare_category_name] = {}
223296
end
224297
end
@@ -228,16 +301,17 @@ def add_category(category)
228301
# Custom marshal serialization to exclude mutex state
229302
# @rbs () -> Array[untyped]
230303
def marshal_dump
231-
[@categories, @total_words, @category_counts, @category_word_count]
304+
[@categories, @total_words, @category_counts, @category_word_count, @dirty]
232305
end
233306

234307
# Custom marshal deserialization to recreate mutex
235308
# @rbs (Array[untyped]) -> void
236309
def marshal_load(data)
237310
mu_initialize
238-
@categories, @total_words, @category_counts, @category_word_count = data
311+
@categories, @total_words, @category_counts, @category_word_count, @dirty = data
239312
@cached_training_count = nil
240313
@cached_vocab_size = nil
314+
@storage = nil
241315
end
242316

243317
# Allows you to remove categories from the classifier.
@@ -255,6 +329,7 @@ def remove_category(category)
255329
raise StandardError, "No such category: #{category}" unless @categories.key?(category)
256330

257331
invalidate_caches
332+
@dirty = true
258333
@total_words -= @category_word_count[category].to_i
259334

260335
@categories.delete(category)
@@ -265,6 +340,17 @@ def remove_category(category)
265340

266341
private
267342

# Restores classifier state from a JSON string (used by reload and reload!).
#
# restore_state resets @storage to nil. That is fine for a freshly built
# instance, but on a live classifier it would drop the configured backend,
# so the next save/reload would raise "No storage configured". The current
# storage is therefore captured before the restore and put back afterwards.
#
# Raises ArgumentError when the payload's "type" field is not "bayes".
#
# @rbs (String) -> void
def restore_from_json(json)
  data = JSON.parse(json)
  raise ArgumentError, "Invalid classifier type: #{data['type']}" unless data['type'] == 'bayes'

  synchronize do
    current_storage = @storage
    restore_state(data)
    # Re-attach the backend that restore_state just cleared.
    @storage = current_storage
  end
end
353+
268354
# Restores classifier state from a hash (used by from_json)
269355
# @rbs (Hash[String, untyped]) -> void
270356
def restore_state(data)
@@ -275,6 +361,8 @@ def restore_state(data)
275361
@category_word_count = Hash.new(0) #: Hash[Symbol, Integer]
276362
@cached_training_count = nil
277363
@cached_vocab_size = nil
364+
@dirty = false
365+
@storage = nil
278366

279367
data['categories'].each do |cat_name, words|
280368
@categories[cat_name.to_sym] = words.transform_keys(&:to_sym)

lib/classifier/errors.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# rbs_inline: enabled
2+
3+
# Author:: Lucas Carlson (mailto:[email protected])
4+
# Copyright:: Copyright (c) 2005 Lucas Carlson
5+
# License:: LGPL
6+
7+
module Classifier
8+
# Base error class for all Classifier errors
9+
class Error < StandardError; end
10+
11+
# Raised when reload would discard unsaved changes
12+
class UnsavedChangesError < Error; end
13+
14+
# Raised when a storage operation fails
15+
class StorageError < Error; end
16+
end

0 commit comments

Comments
 (0)