11/**
2- * (C) Copyright IBM Corp. 2025 .
2+ * (C) Copyright IBM Corp. 2026 .
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
@@ -343,6 +343,13 @@ class SpeechToTextV1 extends BaseService {
343343 * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
344344 * [Using speech recognition
345345 * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
346+ * @param {string } [params.enrichments] - Speech transcript enrichment improves readability of raw ASR transcripts by
347+ * adding punctuation (periods, commas, question marks, exclamation points) and intelligent capitalization (sentence
348+ * beginnings, proper nouns, acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter
349+ * to your recognition request. Supported languages include English (US, UK, Australia, India), French (France,
350+ * Canada), German, Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia,
351+ * Mexico, Peru), and Japanese. See [Speech transcript
352+ * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
346353 * @param {string } [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
347354 * to be used with the recognition request. The base model of the specified custom language model must match the model
348355 * specified with the `model` parameter. You must make the request with credentials for the instance of the service
@@ -569,7 +576,7 @@ class SpeechToTextV1 extends BaseService {
569576 ) : Promise < SpeechToTextV1 . Response < SpeechToTextV1 . SpeechRecognitionResults > > {
570577 const _params = { ...params } ;
571578 const _requiredParams = [ 'audio' ] ;
572- const _validParams = [ 'audio' , 'contentType' , 'model' , 'speechBeginEvent' , 'languageCustomizationId' , 'acousticCustomizationId' , 'baseModelVersion' , 'customizationWeight' , 'inactivityTimeout' , 'keywords' , 'keywordsThreshold' , 'maxAlternatives' , 'wordAlternativesThreshold' , 'wordConfidence' , 'timestamps' , 'profanityFilter' , 'smartFormatting' , 'smartFormattingVersion' , 'speakerLabels' , 'grammarName' , 'redaction' , 'audioMetrics' , 'endOfPhraseSilenceTime' , 'splitTranscriptAtPhraseEnd' , 'speechDetectorSensitivity' , 'sadModule' , 'backgroundAudioSuppression' , 'lowLatency' , 'characterInsertionBias' , 'signal' , 'headers' ] ;
579+ const _validParams = [ 'audio' , 'contentType' , 'model' , 'speechBeginEvent' , 'enrichments' , 'languageCustomizationId' , 'acousticCustomizationId' , 'baseModelVersion' , 'customizationWeight' , 'inactivityTimeout' , 'keywords' , 'keywordsThreshold' , 'maxAlternatives' , 'wordAlternativesThreshold' , 'wordConfidence' , 'timestamps' , 'profanityFilter' , 'smartFormatting' , 'smartFormattingVersion' , 'speakerLabels' , 'grammarName' , 'redaction' , 'audioMetrics' , 'endOfPhraseSilenceTime' , 'splitTranscriptAtPhraseEnd' , 'speechDetectorSensitivity' , 'sadModule' , 'backgroundAudioSuppression' , 'lowLatency' , 'characterInsertionBias' , 'signal' , 'headers' ] ;
573580 const _validationErrors = validateParams ( _params , _requiredParams , _validParams ) ;
574581 if ( _validationErrors ) {
575582 return Promise . reject ( _validationErrors ) ;
@@ -579,6 +586,7 @@ class SpeechToTextV1 extends BaseService {
579586 const query = {
580587 'model' : _params . model ,
581588 'speech_begin_event' : _params . speechBeginEvent ,
589+ 'enrichments' : _params . enrichments ,
582590 'language_customization_id' : _params . languageCustomizationId ,
583591 'acoustic_customization_id' : _params . acousticCustomizationId ,
584592 'base_model_version' : _params . baseModelVersion ,
@@ -939,6 +947,20 @@ class SpeechToTextV1 extends BaseService {
939947 * @param {number } [params.resultsTtl] - The number of minutes for which the results are to be available after the job
940948 * has finished. If not delivered via a callback, the results must be retrieved within this time. Omit the parameter
941949 * to use a time to live of one week. The parameter is valid with or without a callback URL.
950+ * @param {boolean } [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity`
951+ * which contains the time when a speech activity is detected in the stream. This can be used both in standard and low
952+ * latency mode. This feature enables client applications to know that some words/speech has been detected and the
953+ * service is in the process of decoding. This can be used in lieu of interim results in standard mode. Use
954+ * `sad_module: 2` to increase accuracy and performance in detecting speech boundaries within the audio stream. See
955+ * [Using speech recognition
956+ * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
957+ * @param {string } [params.enrichments] - Speech transcript enrichment improves readability of raw ASR transcripts by
958+ * adding punctuation (periods, commas, question marks, exclamation points) and intelligent capitalization (sentence
959+ * beginnings, proper nouns, acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter
960+ * to your recognition request. Supported languages include English (US, UK, Australia, India), French (France,
961+ * Canada), German, Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia,
962+ * Mexico, Peru), and Japanese. See [Speech transcript
963+ * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
942964 * @param {string } [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
943965 * to be used with the recognition request. The base model of the specified custom language model must match the model
944966 * specified with the `model` parameter. You must make the request with credentials for the instance of the service
@@ -1185,7 +1207,7 @@ class SpeechToTextV1 extends BaseService {
11851207 ) : Promise < SpeechToTextV1 . Response < SpeechToTextV1 . RecognitionJob > > {
11861208 const _params = { ...params } ;
11871209 const _requiredParams = [ 'audio' ] ;
1188- const _validParams = [ 'audio' , 'contentType' , 'model' , 'callbackUrl' , 'events' , 'userToken' , 'resultsTtl' , 'languageCustomizationId' , 'acousticCustomizationId' , 'baseModelVersion' , 'customizationWeight' , 'inactivityTimeout' , 'keywords' , 'keywordsThreshold' , 'maxAlternatives' , 'wordAlternativesThreshold' , 'wordConfidence' , 'timestamps' , 'profanityFilter' , 'smartFormatting' , 'smartFormattingVersion' , 'speakerLabels' , 'grammarName' , 'redaction' , 'processingMetrics' , 'processingMetricsInterval' , 'audioMetrics' , 'endOfPhraseSilenceTime' , 'splitTranscriptAtPhraseEnd' , 'speechDetectorSensitivity' , 'sadModule' , 'backgroundAudioSuppression' , 'lowLatency' , 'characterInsertionBias' , 'signal' , 'headers' ] ;
1210+ const _validParams = [ 'audio' , 'contentType' , 'model' , 'callbackUrl' , 'events' , 'userToken' , 'resultsTtl' , 'speechBeginEvent' , 'enrichments' , 'languageCustomizationId' , 'acousticCustomizationId' , 'baseModelVersion' , 'customizationWeight' , 'inactivityTimeout' , 'keywords' , 'keywordsThreshold' , 'maxAlternatives' , 'wordAlternativesThreshold' , 'wordConfidence' , 'timestamps' , 'profanityFilter' , 'smartFormatting' , 'smartFormattingVersion' , 'speakerLabels' , 'grammarName' , 'redaction' , 'processingMetrics' , 'processingMetricsInterval' , 'audioMetrics' , 'endOfPhraseSilenceTime' , 'splitTranscriptAtPhraseEnd' , 'speechDetectorSensitivity' , 'sadModule' , 'backgroundAudioSuppression' , 'lowLatency' , 'characterInsertionBias' , 'signal' , 'headers' ] ;
11891211 const _validationErrors = validateParams ( _params , _requiredParams , _validParams ) ;
11901212 if ( _validationErrors ) {
11911213 return Promise . reject ( _validationErrors ) ;
@@ -1198,6 +1220,8 @@ class SpeechToTextV1 extends BaseService {
11981220 'events' : _params . events ,
11991221 'user_token' : _params . userToken ,
12001222 'results_ttl' : _params . resultsTtl ,
1223+ 'speech_begin_event' : _params . speechBeginEvent ,
1224+ 'enrichments' : _params . enrichments ,
12011225 'language_customization_id' : _params . languageCustomizationId ,
12021226 'acoustic_customization_id' : _params . acousticCustomizationId ,
12031227 'base_model_version' : _params . baseModelVersion ,
@@ -4185,6 +4209,70 @@ class SpeechToTextV1 extends BaseService {
41854209 } ) ,
41864210 } ;
41874211
4212+ return this . createRequest ( parameters ) ;
4213+ }
4214+ /*************************
4215+ * languageIdentification
4216+ ************************/
4217+
4218+ /**
4219+ * Spoken language identification.
4220+ *
4221+ * Detects the spoken language in audio streams. The endpoint is `/v1/detect_language` and user can optionally include
4222+ * `lid_confidence` parameter to set a custom confidence threshold for detection. The model continuously processes
4223+ * incoming audio and returns the identified language when it reaches a confidence level higher than the specified
4224+ * threshold (0.99 by default). See [Spoken language
4225+ * identification](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-language-identification).
4226+ *
4227+ * @param {Object } params - The parameters to send to the service.
4228+ * @param {number } params.lidConfidence - Set a custom confidence threshold for detection.
4229+ * @param {NodeJS.ReadableStream | Buffer } params.audio - The audio to transcribe.
4230+ * @param {string } [params.contentType] - The type of the input.
4231+ * @param {OutgoingHttpHeaders } [params.headers] - Custom request headers
4232+ * @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.LanguageDetectionResults>> }
4233+ */
4234+ public detectLanguage (
4235+ params : SpeechToTextV1 . DetectLanguageParams
4236+ ) : Promise < SpeechToTextV1 . Response < SpeechToTextV1 . LanguageDetectionResults > > {
4237+ const _params = { ...params } ;
4238+ const _requiredParams = [ 'lidConfidence' , 'audio' ] ;
4239+ const _validParams = [ 'lidConfidence' , 'audio' , 'contentType' , 'signal' , 'headers' ] ;
4240+ const _validationErrors = validateParams ( _params , _requiredParams , _validParams ) ;
4241+ if ( _validationErrors ) {
4242+ return Promise . reject ( _validationErrors ) ;
4243+ }
4244+
4245+ const body = _params . audio ;
4246+ const query = {
4247+ 'lid_confidence' : _params . lidConfidence ,
4248+ } ;
4249+
4250+ const sdkHeaders = getSdkHeaders ( SpeechToTextV1 . DEFAULT_SERVICE_NAME , 'v1' , 'detectLanguage' ) ;
4251+
4252+ const parameters = {
4253+ options : {
4254+ url : '/v1/detect_language' ,
4255+ method : 'POST' ,
4256+ body,
4257+ qs : query ,
4258+ } ,
4259+ defaultOptions : extend ( true , { } , this . baseOptions , {
4260+ headers : extend (
4261+ true ,
4262+ sdkHeaders ,
4263+ this . baseOptions . headers ,
4264+ {
4265+ 'Accept' : 'application/json' ,
4266+ 'Content-Type' : _params . contentType ,
4267+ } ,
4268+ _params . headers
4269+ ) ,
4270+ axiosOptions : {
4271+ signal : _params . signal ,
4272+ } ,
4273+ } ) ,
4274+ } ;
4275+
41884276 return this . createRequest ( parameters ) ;
41894277 }
41904278}
@@ -4356,6 +4444,15 @@ namespace SpeechToTextV1 {
43564444 * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
43574445 */
43584446 speechBeginEvent ?: boolean ;
4447+ /** Speech transcript enrichment improves readability of raw ASR transcripts by adding punctuation (periods,
4448+ * commas, question marks, exclamation points) and intelligent capitalization (sentence beginnings, proper nouns,
4449+ * acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter to your recognition
4450+ * request. Supported languages include English (US, UK, Australia, India), French (France, Canada), German,
4451+ * Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia, Mexico,
4452+ * Peru), and Japanese. See [Speech transcript
4453+ * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
4454+ */
4455+ enrichments ?: string ;
43594456 /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The
43604457 * base model of the specified custom language model must match the model specified with the `model` parameter. You
43614458 * must make the request with credentials for the instance of the service that owns the custom model. By default,
@@ -4810,6 +4907,23 @@ namespace SpeechToTextV1 {
48104907 * week. The parameter is valid with or without a callback URL.
48114908 */
48124909 resultsTtl ?: number ;
4910+ /** If `true`, the service returns a response object `SpeechActivity` which contains the time when a speech
4911+ * activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables
4912+ * client applications to know that some words/speech has been detected and the service is in the process of
4913+ * decoding. This can be used in lieu of interim results in standard mode. Use `sad_module: 2` to increase accuracy
4914+ * and performance in detecting speech boundaries within the audio stream. See [Using speech recognition
4915+ * parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
4916+ */
4917+ speechBeginEvent ?: boolean ;
4918+ /** Speech transcript enrichment improves readability of raw ASR transcripts by adding punctuation (periods,
4919+ * commas, question marks, exclamation points) and intelligent capitalization (sentence beginnings, proper nouns,
4920+ * acronyms, brand names). To enable enrichment, add the `enrichments=punctuation` parameter to your recognition
4921+ * request. Supported languages include English (US, UK, Australia, India), French (France, Canada), German,
4922+ * Italian, Portuguese (Brazil, Portugal), Spanish (Spain, Latin America, Argentina, Chile, Colombia, Mexico,
4923+ * Peru), and Japanese. See [Speech transcript
4924+ * enrichment](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speech-transcript-enrichment).
4925+ */
4926+ enrichments ?: string ;
48134927 /** The customization ID (GUID) of a custom language model that is to be used with the recognition request. The
48144928 * base model of the specified custom language model must match the model specified with the `model` parameter. You
48154929 * must make the request with credentials for the instance of the service that owns the custom model. By default,
@@ -6096,6 +6210,39 @@ namespace SpeechToTextV1 {
60966210 customerId : string ;
60976211 }
60986212
6213+ /** Parameters for the `detectLanguage` operation. */
6214+ export interface DetectLanguageParams extends DefaultParams {
6215+ /** Set a custom confidence threshold for detection. */
6216+ lidConfidence : number ;
6217+ /** The audio to transcribe. */
6218+ audio : NodeJS . ReadableStream | Buffer ;
6219+ /** The type of the input. */
6220+ contentType ?: DetectLanguageConstants . ContentType | string ;
6221+ }
6222+
6223+ /** Constants for the `detectLanguage` operation. */
6224+ export namespace DetectLanguageConstants {
6225+ /** The type of the input. */
6226+ export enum ContentType {
6227+ APPLICATION_OCTET_STREAM = 'application/octet-stream' ,
6228+ AUDIO_ALAW = 'audio/alaw' ,
6229+ AUDIO_BASIC = 'audio/basic' ,
6230+ AUDIO_FLAC = 'audio/flac' ,
6231+ AUDIO_G729 = 'audio/g729' ,
6232+ AUDIO_L16 = 'audio/l16' ,
6233+ AUDIO_MP3 = 'audio/mp3' ,
6234+ AUDIO_MPEG = 'audio/mpeg' ,
6235+ AUDIO_MULAW = 'audio/mulaw' ,
6236+ AUDIO_OGG = 'audio/ogg' ,
6237+ AUDIO_OGG_CODECS_OPUS = 'audio/ogg;codecs=opus' ,
6238+ AUDIO_OGG_CODECS_VORBIS = 'audio/ogg;codecs=vorbis' ,
6239+ AUDIO_WAV = 'audio/wav' ,
6240+ AUDIO_WEBM = 'audio/webm' ,
6241+ AUDIO_WEBM_CODECS_OPUS = 'audio/webm;codecs=opus' ,
6242+ AUDIO_WEBM_CODECS_VORBIS = 'audio/webm;codecs=vorbis' ,
6243+ }
6244+ }
6245+
60996246 /*************************
61006247 * model interfaces
61016248 ************************/
@@ -6485,6 +6632,44 @@ namespace SpeechToTextV1 {
64856632 display_as ?: string ;
64866633 }
64876634
6635+ /**
6636+ * If enriched results are requested, transcription with inserted punctuation marks such as periods, commas, question
6637+ * marks, and exclamation points.
6638+ */
6639+ export interface EnrichedResults {
6640+ /** If enriched results are requested, transcription with inserted punctuation marks such as periods, commas,
6641+ * question marks, and exclamation points.
6642+ */
6643+ transcript ?: EnrichedResultsTranscript ;
6644+ /** The status of the enriched transcription. */
6645+ status ?: string ;
6646+ }
6647+
6648+ /**
6649+ * If enriched results are requested, transcription with inserted punctuation marks such as periods, commas, question
6650+ * marks, and exclamation points.
6651+ */
6652+ export interface EnrichedResultsTranscript {
6653+ /** The transcript text. */
6654+ text ?: string ;
6655+ /** The speaking time from the beginning of the transcript to the end. */
6656+ timestamp ?: EnrichedResultsTranscriptTimestamp ;
6657+ }
6658+
6659+ /**
6660+ * The speaking time from the beginning of the transcript to the end.
6661+ */
6662+ export interface EnrichedResultsTranscriptTimestamp {
6663+ /** The start time of a word from the transcript. The value matches the start time of a word from the
6664+ * `timestamps` array.
6665+ */
6666+ from ?: number ;
6667+ /** The end time of a word from the transcript. The value matches the end time of a word from the `timestamps`
6668+ * array.
6669+ */
6670+ to ?: number ;
6671+ }
6672+
64886673 /**
64896674 * Information about a grammar from a custom language model.
64906675 */
@@ -6548,6 +6733,42 @@ namespace SpeechToTextV1 {
65486733 confidence : number ;
65496734 }
65506735
6736+ /**
6737+ * Language detection results.
6738+ */
6739+ export interface LanguageDetectionResult {
6740+ /** An array of `LanguageInfo` objects. */
6741+ language_info ?: LanguageInfo [ ] ;
6742+ }
6743+
6744+ /**
6745+ * Language detection results.
6746+ */
6747+ export interface LanguageDetectionResults {
6748+ /** An array of `LanguageDetectionResult` objects. */
6749+ results ?: LanguageDetectionResult [ ] ;
6750+ /** An index that indicates a change point in the `results` array. The service increments the index for
6751+ * additional results that it sends for new audio for the same request. All results with the same index are
6752+ * delivered at the same time. The same index can include multiple final results that are delivered with the same
6753+ * response.
6754+ */
6755+ result_index ?: number ;
6756+ }
6757+
6758+ /**
6759+ * Language detection info such as confidence and language detected.
6760+ */
6761+ export interface LanguageInfo {
6762+ /** A score that indicates the service's confidence in its identification of the language in the range of 0.0 to
6763+ * 1.0.
6764+ */
6765+ confidence ?: number ;
6766+ /** The language detected in standard abbreviated ISO 639 format. */
6767+ language ?: string ;
6768+ /** The timestamp of the detected language. */
6769+ timestamp ?: number ;
6770+ }
6771+
65516772 /**
65526773 * Information about an existing custom language model.
65536774 */
@@ -6880,8 +7101,7 @@ namespace SpeechToTextV1 {
68807101 * * If `false`, the results are interim. They can be updated with further interim results until final results are
68817102 * eventually sent.
68827103 *
6883- * **Note:** Because `final` is a reserved word in Java and Swift, the field is renamed `xFinal` in Java and is
6884- * escaped with back quotes in Swift.
7104+ * **Note:** Because `final` is a reserved word in Java, the field is renamed `xFinal` in Java.
68857105 */
68867106 final : boolean ;
68877107 /** An array of alternative transcripts. The `alternatives` array can include additional requested output such
@@ -6969,6 +7189,10 @@ namespace SpeechToTextV1 {
69697189 * In both cases, the request succeeds despite the warnings.
69707190 */
69717191 warnings ?: string [ ] ;
7192+ /** If enriched results are requested, transcription with inserted punctuation marks such as periods, commas,
7193+ * question marks, and exclamation points.
7194+ */
7195+ enriched_results ?: EnrichedResults ;
69727196 }
69737197
69747198 /**
0 commit comments