
Commit cb10121

feat(stt): add recognize enrichments, add new function detectLanguage
1 parent 98b4622 commit cb10121

2 files changed: +331, -6 lines changed

speech-to-text/v1-generated.ts

Lines changed: 229 additions & 5 deletions
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright IBM Corp. 2025.
+ * (C) Copyright IBM Corp. 2026.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -343,6 +343,13 @@ class SpeechToTextV1 extends BaseService {
    * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
    * [Using speech recognition
    * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
+   * @param {string} [params.enrichments] - Speech transcript enrichment improves readability of raw ASR transcripts by
+   * adding punctuation (periods, commas, question marks, exclamation points) and intelligent capitalization (sentence
+   * beginnings, proper nouns, acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter
+   * to your recognition request. Supported languages include English (US, UK, Australia, India), French (France,
+   * Canada), German, Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia,
+   * Mexico, Peru), and Japanese. See [Speech transcript
+   * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
    * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
    * to be used with the recognition request. The base model of the specified custom language model must match the model
    * specified with the `model` parameter. You must make the request with credentials for the instance of the service
@@ -569,7 +576,7 @@ class SpeechToTextV1 extends BaseService {
   ): Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechRecognitionResults>> {
     const _params = { ...params };
     const _requiredParams = ['audio'];
-    const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+    const _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'enrichments', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
     const _validationErrors = validateParams(_params, _requiredParams, _validParams);
     if (_validationErrors) {
       return Promise.reject(_validationErrors);
@@ -579,6 +586,7 @@ class SpeechToTextV1 extends BaseService {
     const query = {
       'model': _params.model,
       'speech_begin_event': _params.speechBeginEvent,
+      'enrichments': _params.enrichments,
       'language_customization_id': _params.languageCustomizationId,
       'acoustic_customization_id': _params.acousticCustomizationId,
       'base_model_version': _params.baseModelVersion,
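For illustration, a minimal sketch of a `recognize` call that opts into the new query parameter. The client setup follows the SDK's usual pattern; the API key, service URL, audio file, and model name are placeholders:

import SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
import { IamAuthenticator } from 'ibm-watson/auth';
import fs = require('fs');

const speechToText = new SpeechToTextV1({
  authenticator: new IamAuthenticator({ apikey: '{apikey}' }),
  serviceUrl: '{url}',
});

// Request punctuation/capitalization enrichment alongside normal transcription.
speechToText.recognize({
  audio: fs.createReadStream('audio.wav'), // placeholder file
  contentType: 'audio/wav',
  model: 'en-US_Multimedia', // assumed: any model for an enrichment-supported language
  enrichments: 'punctuation',
})
  .then((response) => console.log(JSON.stringify(response.result, null, 2)))
  .catch((err) => console.error(err));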
@@ -939,6 +947,20 @@ class SpeechToTextV1 extends BaseService {
    * @param {number} [params.resultsTtl] - The number of minutes for which the results are to be available after the job
    * has finished. If not delivered via a callback, the results must be retrieved within this time. Omit the parameter
    * to use a time to live of one week. The parameter is valid with or without a callback URL.
+   * @param {boolean} [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity`
+   * which contains the time when a speech activity is detected in the stream. This can be used both in standard and low
+   * latency mode. This feature enables client applications to know that some words/speech has been detected and the
+   * service is in the process of decoding. This can be used in lieu of interim results in standard mode. Use
+   * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
+   * [Using speech recognition
+   * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
+   * @param {string} [params.enrichments] - Speech transcript enrichment improves readability of raw ASR transcripts by
+   * adding punctuation (periods, commas, question marks, exclamation points) and intelligent capitalization (sentence
+   * beginnings, proper nouns, acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter
+   * to your recognition request. Supported languages include English (US, UK, Australia, India), French (France,
+   * Canada), German, Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia,
+   * Mexico, Peru), and Japanese. See [Speech transcript
+   * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
    * @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
    * to be used with the recognition request. The base model of the specified custom language model must match the model
    * specified with the `model` parameter. You must make the request with credentials for the instance of the service
@@ -1185,7 +1207,7 @@ class SpeechToTextV1 extends BaseService {
   ): Promise<SpeechToTextV1.Response<SpeechToTextV1.RecognitionJob>> {
     const _params = { ...params };
     const _requiredParams = ['audio'];
-    const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
+    const _validParams = ['audio', 'contentType', 'model', 'callbackUrl', 'events', 'userToken', 'resultsTtl', 'speechBeginEvent', 'enrichments', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'processingMetrics', 'processingMetricsInterval', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'sadModule', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'signal', 'headers'];
     const _validationErrors = validateParams(_params, _requiredParams, _validParams);
     if (_validationErrors) {
       return Promise.reject(_validationErrors);
@@ -1198,6 +1220,8 @@ class SpeechToTextV1 extends BaseService {
       'events': _params.events,
       'user_token': _params.userToken,
       'results_ttl': _params.resultsTtl,
+      'speech_begin_event': _params.speechBeginEvent,
+      'enrichments': _params.enrichments,
       'language_customization_id': _params.languageCustomizationId,
       'acoustic_customization_id': _params.acousticCustomizationId,
       'base_model_version': _params.baseModelVersion,
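The same two query parameters flow through the asynchronous interface. A sketch, reusing the `speechToText` client and imports from the earlier example; the audio file is again a placeholder:

// Submit an async job with speech-begin events and enrichment enabled;
// poll checkJob (or register a callback URL) to collect the results later.
const { result: job } = await speechToText.createJob({
  audio: fs.createReadStream('audio.wav'), // placeholder file
  contentType: 'audio/wav',
  speechBeginEvent: true,
  enrichments: 'punctuation',
});
console.log('job id:', job.id, 'status:', job.status);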
@@ -4185,6 +4209,70 @@ class SpeechToTextV1 extends BaseService {
       }),
     };
 
+    return this.createRequest(parameters);
+  }
+  /*************************
+   * languageIdentification
+   ************************/
+
+  /**
+   * Spoken language identification.
+   *
+   * Detects the spoken language in audio streams. The endpoint is `/v1/detect_language`, and users can optionally
+   * include the `lid_confidence` parameter to set a custom confidence threshold for detection. The model continuously
+   * processes incoming audio and returns the identified language when it reaches a confidence level higher than the
+   * specified threshold (0.99 by default). See [Spoken language
+   * identification](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-language-identification).
+   *
+   * @param {Object} params - The parameters to send to the service.
+   * @param {number} params.lidConfidence - Set a custom confidence threshold for detection.
+   * @param {NodeJS.ReadableStream | Buffer} params.audio - The audio to transcribe.
+   * @param {string} [params.contentType] - The type of the input.
+   * @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
+   * @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.LanguageDetectionResults>>}
+   */
+  public detectLanguage(
+    params: SpeechToTextV1.DetectLanguageParams
+  ): Promise<SpeechToTextV1.Response<SpeechToTextV1.LanguageDetectionResults>> {
+    const _params = { ...params };
+    const _requiredParams = ['lidConfidence', 'audio'];
+    const _validParams = ['lidConfidence', 'audio', 'contentType', 'signal', 'headers'];
+    const _validationErrors = validateParams(_params, _requiredParams, _validParams);
+    if (_validationErrors) {
+      return Promise.reject(_validationErrors);
+    }
+
+    const body = _params.audio;
+    const query = {
+      'lid_confidence': _params.lidConfidence,
+    };
+
+    const sdkHeaders = getSdkHeaders(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'detectLanguage');
+
+    const parameters = {
+      options: {
+        url: '/v1/detect_language',
+        method: 'POST',
+        body,
+        qs: query,
+      },
+      defaultOptions: extend(true, {}, this.baseOptions, {
+        headers: extend(
+          true,
+          sdkHeaders,
+          this.baseOptions.headers,
+          {
+            'Accept': 'application/json',
+            'Content-Type': _params.contentType,
+          },
+          _params.headers
+        ),
+        axiosOptions: {
+          signal: _params.signal,
+        },
+      }),
+    };
+
     return this.createRequest(parameters);
   }
 }
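A sketch of calling the new operation, assuming the client from the earlier example; the threshold and file name are placeholders:

// Ask the service to identify the spoken language once its confidence
// exceeds the supplied threshold.
const { result } = await speechToText.detectLanguage({
  lidConfidence: 0.9, // placeholder threshold; the service default is 0.99
  audio: fs.createReadStream('audio.mp3'),
  contentType: 'audio/mp3',
});
console.log(JSON.stringify(result, null, 2));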
@@ -4356,6 +4444,15 @@ namespace SpeechToTextV1 {
     * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
     */
    speechBeginEvent?: boolean;
+    /** Speech transcript enrichment improves readability of raw ASR transcripts by adding punctuation (periods,
+     * commas, question marks, exclamation points) and intelligent capitalization (sentence beginnings, proper nouns,
+     * acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter to your recognition
+     * request. Supported languages include English (US, UK, Australia, India), French (France, Canada), German,
+     * Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia, Mexico,
+     * Peru), and Japanese. See [Speech transcript
+     * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
+     */
+    enrichments?: string;
    /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The
     * base model of the specified custom language model must match the model specified with the `model` parameter. You
     * must make the request with credentials for the instance of the service that owns the custom model. By default,
@@ -4810,6 +4907,23 @@ namespace SpeechToTextV1 {
     * week. The parameter is valid with or without a callback URL.
     */
    resultsTtl?: number;
+    /** If `true`, the service returns a response object `SpeechActivity` which contains the time when a speech
+     * activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables
+     * client applications to know that some words/speech has been detected and the service is in the process of
+     * decoding. This can be used in lieu of interim results in standard mode. Use `sad_module: 2` to increase accuracy
+     * and performance in detecting speech boundaries within the audio stream. See [Using speech recognition
+     * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
+     */
+    speechBeginEvent?: boolean;
+    /** Speech transcript enrichment improves readability of raw ASR transcripts by adding punctuation (periods,
+     * commas, question marks, exclamation points) and intelligent capitalization (sentence beginnings, proper nouns,
+     * acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter to your recognition
+     * request. Supported languages include English (US, UK, Australia, India), French (France, Canada), German,
+     * Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia, Mexico,
+     * Peru), and Japanese. See [Speech transcript
+     * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
+     */
+    enrichments?: string;
    /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The
     * base model of the specified custom language model must match the model specified with the `model` parameter. You
     * must make the request with credentials for the instance of the service that owns the custom model. By default,
@@ -6096,6 +6210,39 @@ namespace SpeechToTextV1 {
    customerId: string;
  }
 
+  /** Parameters for the `detectLanguage` operation. */
+  export interface DetectLanguageParams extends DefaultParams {
+    /** Set a custom confidence threshold for detection. */
+    lidConfidence: number;
+    /** The audio to transcribe. */
+    audio: NodeJS.ReadableStream | Buffer;
+    /** The type of the input. */
+    contentType?: DetectLanguageConstants.ContentType | string;
+  }
+
+  /** Constants for the `detectLanguage` operation. */
+  export namespace DetectLanguageConstants {
+    /** The type of the input. */
+    export enum ContentType {
+      APPLICATION_OCTET_STREAM = 'application/octet-stream',
+      AUDIO_ALAW = 'audio/alaw',
+      AUDIO_BASIC = 'audio/basic',
+      AUDIO_FLAC = 'audio/flac',
+      AUDIO_G729 = 'audio/g729',
+      AUDIO_L16 = 'audio/l16',
+      AUDIO_MP3 = 'audio/mp3',
+      AUDIO_MPEG = 'audio/mpeg',
+      AUDIO_MULAW = 'audio/mulaw',
+      AUDIO_OGG = 'audio/ogg',
+      AUDIO_OGG_CODECS_OPUS = 'audio/ogg;codecs=opus',
+      AUDIO_OGG_CODECS_VORBIS = 'audio/ogg;codecs=vorbis',
+      AUDIO_WAV = 'audio/wav',
+      AUDIO_WEBM = 'audio/webm',
+      AUDIO_WEBM_CODECS_OPUS = 'audio/webm;codecs=opus',
+      AUDIO_WEBM_CODECS_VORBIS = 'audio/webm;codecs=vorbis',
+    }
+  }
+
  /*************************
   * model interfaces
   ************************/
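Callers who prefer the generated constants over raw MIME strings can build the params object like this (a sketch reusing the `fs` import from the first example; `sample.flac` is a placeholder):

const params: SpeechToTextV1.DetectLanguageParams = {
  lidConfidence: 0.95,
  audio: fs.createReadStream('sample.flac'),
  // The enum value resolves to the plain string 'audio/flac'.
  contentType: SpeechToTextV1.DetectLanguageConstants.ContentType.AUDIO_FLAC,
};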
@@ -6485,6 +6632,44 @@ namespace SpeechToTextV1 {
    display_as?: string;
  }
 
+  /**
+   * If enriched results are requested, transcription with inserted punctuation marks such as periods, commas, question
+   * marks, and exclamation points.
+   */
+  export interface EnrichedResults {
+    /** If enriched results are requested, transcription with inserted punctuation marks such as periods, commas,
+     * question marks, and exclamation points.
+     */
+    transcript?: EnrichedResultsTranscript;
+    /** The status of the enriched transcription. */
+    status?: string;
+  }
+
+  /**
+   * If enriched results are requested, transcription with inserted punctuation marks such as periods, commas, question
+   * marks, and exclamation points.
+   */
+  export interface EnrichedResultsTranscript {
+    /** The transcript text. */
+    text?: string;
+    /** The speaking time from the beginning of the transcript to the end. */
+    timestamp?: EnrichedResultsTranscriptTimestamp;
+  }
+
+  /**
+   * The speaking time from the beginning of the transcript to the end.
+   */
+  export interface EnrichedResultsTranscriptTimestamp {
+    /** The start time of a word from the transcript. The value matches the start time of a word from the
+     * `timestamps` array.
+     */
+    from?: number;
+    /** The end time of a word from the transcript. The value matches the end time of a word from the `timestamps`
+     * array.
+     */
+    to?: number;
+  }
+
  /**
   * Information about a grammar from a custom language model.
   */
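For orientation, a value of this shape might look as follows. This is an illustrative literal only; the `status` string is an assumption, as the set of status values is not documented in this diff:

const enriched: SpeechToTextV1.EnrichedResults = {
  status: 'completed', // hypothetical status value
  transcript: {
    text: 'Hello, world. How are you today?',
    // Spans from the start time of the first word to the end time of the
    // last word in the parallel `timestamps` array.
    timestamp: { from: 0.3, to: 2.1 },
  },
};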
@@ -6548,6 +6733,42 @@ namespace SpeechToTextV1 {
    confidence: number;
  }
 
+  /**
+   * Language detection results.
+   */
+  export interface LanguageDetectionResult {
+    /** An array of `LanguageInfo` objects. */
+    language_info?: LanguageInfo[];
+  }
+
+  /**
+   * Language detection results.
+   */
+  export interface LanguageDetectionResults {
+    /** An array of `LanguageDetectionResult` objects. */
+    results?: LanguageDetectionResult[];
+    /** An index that indicates a change point in the `results` array. The service increments the index for
+     * additional results that it sends for new audio for the same request. All results with the same index are
+     * delivered at the same time. The same index can include multiple final results that are delivered with the same
+     * response.
+     */
+    result_index?: number;
+  }
+
+  /**
+   * Language detection info such as confidence and language detected.
+   */
+  export interface LanguageInfo {
+    /** A score that indicates the service's confidence in its identification of the language in the range of 0.0 to
+     * 1.0.
+     */
+    confidence?: number;
+    /** The language detected in standard abbreviated ISO 639 format. */
+    language?: string;
+    /** The timestamp of the detected language. */
+    timestamp?: number;
+  }
+
  /**
   * Information about an existing custom language model.
   */
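Since every field in these interfaces is optional, a consumer has to defend against missing arrays. A small helper sketch that picks the highest-confidence detection:

function bestLanguage(
  results: SpeechToTextV1.LanguageDetectionResults
): SpeechToTextV1.LanguageInfo | undefined {
  // Flatten all LanguageInfo entries across results, then take the most confident.
  const infos = (results.results ?? []).flatMap((r) => r.language_info ?? []);
  return infos.sort((a, b) => (b.confidence ?? 0) - (a.confidence ?? 0))[0];
}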
@@ -6880,8 +7101,7 @@ namespace SpeechToTextV1 {
     * * If `false`, the results are interim. They can be updated with further interim results until final results are
     * eventually sent.
     *
-    * **Note:** Because `final` is a reserved word in Java and Swift, the field is renamed `xFinal` in Java and is
-    * escaped with back quotes in Swift.
+    * **Note:** Because `final` is a reserved word in Java, the field is renamed `xFinal` in Java.
     */
    final: boolean;
    /** An array of alternative transcripts. The `alternatives` array can include additional requested output such
@@ -6969,6 +7189,10 @@ namespace SpeechToTextV1 {
     * In both cases, the request succeeds despite the warnings.
     */
    warnings?: string[];
+    /** If enriched results are requested, transcription with inserted punctuation marks such as periods, commas,
+     * question marks, and exclamation points.
+     */
+    enriched_results?: EnrichedResults;
  }
 
  /**
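Putting the pieces together, a caller can prefer the enriched text when it is present and fall back to the raw alternative otherwise. A sketch against the `recognize` call shown earlier:

const { result } = await speechToText.recognize({
  audio: fs.createReadStream('audio.wav'), // placeholder file
  contentType: 'audio/wav',
  enrichments: 'punctuation',
});
for (const r of result.results ?? []) {
  // Use the punctuated/capitalized transcript when enrichment produced one.
  const text = r.enriched_results?.transcript?.text ?? r.alternatives[0]?.transcript;
  console.log(text);
}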