From dc67c31d22a945ec7a36e5346f22af77057049d5 Mon Sep 17 00:00:00 2001 From: kolubex Date: Thu, 4 Jul 2024 21:50:55 +0000 Subject: [PATCH 1/4] added_base_files_for_av --- balu_codes/transcribe.py | 13 + .../asr/data/audio_to_text_dataset.py | 62 + nemo/collections/asr/data/av_to_text.py | 658 ++++++++ .../asr/models/av_ctc_bpe_models.py | 656 ++++++++ nemo/collections/asr/models/av_ctc_models.py | 876 ++++++++++ .../common/parts/preprocessing/collections.py | 329 +++- scripts/tokenizers/sentencepiece_model_pb2.py | 764 +++++++++ tools/nemo_forced_aligner/align.py | 2 +- .../asr/ASR_CTC_Language_Finetuning.ipynb | 1144 +++++++++---- .../asr/asr_adapters/ASR_with_Adapters.ipynb | 1485 ++++++++++++----- 10 files changed, 5194 insertions(+), 795 deletions(-) create mode 100644 balu_codes/transcribe.py create mode 100644 nemo/collections/asr/data/av_to_text.py create mode 100644 nemo/collections/asr/models/av_ctc_bpe_models.py create mode 100644 nemo/collections/asr/models/av_ctc_models.py create mode 100644 scripts/tokenizers/sentencepiece_model_pb2.py diff --git a/balu_codes/transcribe.py b/balu_codes/transcribe.py new file mode 100644 index 000000000000..0decf8696802 --- /dev/null +++ b/balu_codes/transcribe.py @@ -0,0 +1,13 @@ +# import nemo.collections.asr as nemo_asr +import sys +import os +sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo/collections')) +sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo')) +import asr as nemo_asr + +def load_model(model_name): + model = nemo_asr.models.ASRModel.from_pretrained(model_name) + return model + +model = load_model("stt_en_conformer_ctc_large") +model.transcribe(["/disk1/mixed_dataset_000/mixed_audios/train/08zhZZn29jc_496fe_1997_Peters_Township_High_School_Commencement_SLASH_1997_Peters_Township_High_School_Commencement_DOT_mp3_00010.wav"]) \ No newline at end of file diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 7ad6560b4401..25334017d792 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -25,6 +25,7 @@ from torch.utils.data import ChainDataset from nemo.collections.asr.data import audio_to_text, audio_to_text_dali +from nemo.collections.asr.data import av_to_text from nemo.collections.asr.data.huggingface.hf_audio_to_text_dataset import ( get_hf_audio_to_text_bpe_dataset, get_hf_audio_to_text_char_dataset, @@ -704,6 +705,67 @@ def get_audio_to_text_char_dataset_from_config( dataset = get_char_dataset(config=config, augmentor=augmentor) return dataset +def get_av_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = None) -> av_to_text.AVToCharDataset: + """ + Instantiates a Character Encoding based AVToCharDataset. + + Args: + config: Config of the AVToCharDataset. + augmentor: Optional AudioAugmentor object for augmentations on audio data. + + Returns: + An instance of AV. 
+    """
+    if 'labels' not in config:
+        logging.warning("dataset does not have explicitly defined labels")
+
+    dataset = av_to_text.AVToCharDataset(
+        manifest_filepath=config['manifest_filepath'],
+        labels=config.get('labels', None),
+        sample_rate=config['sample_rate'],
+        int_values=config.get('int_values', False),
+        augmentor=augmentor,
+        max_duration=config.get('max_duration', None),
+        min_duration=config.get('min_duration', None),
+        max_utts=config.get('max_utts', 0),
+        blank_index=config.get('blank_index', -1),
+        unk_index=config.get('unk_index', -1),
+        normalize=config.get('normalize_transcripts', False),
+        trim=config.get('trim_silence', False),
+        parser=config.get('parser', 'en'),
+        return_sample_id=config.get('return_sample_id', False),
+        channel_selector=config.get('channel_selector', None),
+        video_frame_rate=config.get('video_frame_rate', 3),
+    )
+    return dataset
+
+def get_av_to_text_char_dataset_from_config(
+    config, local_rank: int, global_rank: int, world_size: int, preprocessor_cfg: Optional[DictConfig] = None
+):
+    """
+    Construct an AV-To-Text Char dataset from a config.
+    Args:
+        config: dataset config
+        local_rank: model local rank
+        global_rank: model global rank
+        world_size: world size
+        preprocessor_cfg: preprocessor config, for DALI dataset
+
+    Returns:
+        constructed dataset or None if dataset config is invalid or nothing to load
+    """
+
+    if 'augmentor' in config:
+        augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size)
+    else:
+        augmentor = None
+
+    if 'manifest_filepath' in config and config['manifest_filepath'] is None:
+        logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config: {config}")
+        return None
+    dataset = get_av_char_dataset(config=config, augmentor=augmentor)
+    return dataset
+

 def get_audio_to_text_bpe_dataset_from_config(
     config,
diff --git a/nemo/collections/asr/data/av_to_text.py b/nemo/collections/asr/data/av_to_text.py
new file mode 100644
index 000000000000..066bbad340de
--- /dev/null
+++ b/nemo/collections/asr/data/av_to_text.py
@@ -0,0 +1,658 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
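+
+# Dataset classes for audio-visual (AV) ASR: each manifest entry pairs an audio
+# file with a precomputed video feature file ("video_featpath", a saved numpy
+# array), which is loaded alongside the waveform and collated per batch.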
+import io
+import json
+import math
+import multiprocessing
+import os
+from collections.abc import Iterable as IterableABC
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import braceexpand
+import numpy as np
+import torch
+import webdataset as wds
+from torch.utils.data import ChainDataset
+from tqdm import tqdm
+
+from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
+from nemo.collections.asr.parts.preprocessing.segment import available_formats as valid_sf_formats
+from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType
+from nemo.collections.common import tokenizers
+from nemo.collections.common.parts.preprocessing import collections, parsers
+from nemo.core.classes import Dataset, IterableDataset
+from nemo.core.neural_types import *
+from nemo.utils import logging
+from nemo.utils.data_utils import (
+    DataStoreObject,
+    datastore_object_get,
+    datastore_path_to_webdataset_url,
+    is_datastore_cache_shared,
+    is_datastore_path,
+    is_tarred_path,
+)
+from nemo.utils.distributed import webdataset_split_by_workers
+from nemo.utils.get_rank import is_global_rank_zero
+
+__all__ = [
+    'AVToCharDataset',
+    'AVToBPEDataset',
+]
+
+VALID_FILE_FORMATS = ';'.join(['wav', 'mp3', 'flac', 'opus'] + [fmt.lower() for fmt in valid_sf_formats.keys()])
+
+
+def _speech_collate_fn(batch, pad_id):
+    """Collate a batch of (audio signal, audio length, video features, tokens, token length[, sample id]) tuples.
+    Args:
+        batch: A tuple of tuples of (Optional[FloatTensor], Optional[LongTensor], FloatTensor,
+            LongTensor, LongTensor) holding signal, signal lengths, video features,
+            encoded tokens, and encoded token lengths. This collate func assumes the
+            audio signals are 1d torch tensors (i.e. mono audio).
+    """
+    packed_batch = list(zip(*batch))
+    if len(packed_batch) == 6:
+        _, audio_lengths, _, _, tokens_lengths, sample_ids = packed_batch
+    elif len(packed_batch) == 5:
+        sample_ids = None
+        _, audio_lengths, _, _, tokens_lengths = packed_batch
+    else:
+        raise ValueError("Expects 5 or 6 tensors in the batch!")
+    max_audio_len = 0
+    has_audio = audio_lengths[0] is not None
+    if has_audio:
+        max_audio_len = max(audio_lengths).item()
+    max_tokens_len = max(tokens_lengths).item()
+
+    audio_signal, tokens, video_feat_signal = [], [], []
+    for b in batch:
+        if len(b) == 6:
+            sig, sig_len, video_feat, tokens_i, tokens_i_len, _ = b
+        else:
+            sig, sig_len, video_feat, tokens_i, tokens_i_len = b
+        if has_audio:
+            sig_len = sig_len.item()
+            if sig_len < max_audio_len:
+                pad = (0, max_audio_len - sig_len)
+                sig = torch.nn.functional.pad(sig, pad)
+            audio_signal.append(sig)
+        video_feat_signal.append(video_feat)
+        tokens_i_len = tokens_i_len.item()
+        if tokens_i_len < max_tokens_len:
+            pad = (0, max_tokens_len - tokens_i_len)
+            tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id)
+        tokens.append(tokens_i)
+
+    if has_audio:
+        audio_signal = torch.stack(audio_signal)
+        audio_lengths = torch.stack(audio_lengths)
+    else:
+        audio_signal, audio_lengths = None, None
+    video_feat_signal = torch.stack(video_feat_signal)
+    tokens = torch.stack(tokens)
+    tokens_lengths = torch.stack(tokens_lengths)
+    if sample_ids is None:
+        # Keep the tuple layout consistent with the 6-tuple branch below: the
+        # video features must be returned in both cases.
+        return audio_signal, audio_lengths, video_feat_signal, tokens, tokens_lengths
+    else:
+        sample_ids = torch.tensor(sample_ids, dtype=torch.int32)
+        return audio_signal, audio_lengths, video_feat_signal, tokens, tokens_lengths, sample_ids
+
+
+class ASR_AV_ManifestProcessor:
+    """
+    Class that processes a manifest json file containing paths to audio files, transcripts,
and durations (in seconds). + Each new line is a different sample. Example below: + {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} + ... + {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": + "utterance_id", "ctm_utt": "en_4156", "side": "A", "video_featpath": "/path/to/video_feat.npy"} + Args: + manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. + parser: Str for a language specific preprocessor or a callable. + max_duration: If audio exceeds this length, do not include in dataset. + min_duration: If audio is less than this length, do not include in dataset. + max_utts: Limit number of utterances. + bos_id: Id of beginning of sequence symbol to append if not None. + eos_id: Id of end of sequence symbol to append if not None. + pad_id: Id of pad symbol. Defaults to 0. + """ + + def __init__( + self, + manifest_filepath: str, + parser: Union[str, Callable], + max_duration: Optional[float] = None, + min_duration: Optional[float] = None, + max_utts: int = 0, + bos_id: Optional[int] = None, + eos_id: Optional[int] = None, + pad_id: int = 0, + index_by_file_id: bool = False, + ): + self.parser = parser + + self.collection = collections.ASR_AV_AudioText( + manifests_files=manifest_filepath, + parser=parser, + min_duration=min_duration, + max_duration=max_duration, + max_number=max_utts, + index_by_file_id=index_by_file_id, + ) + + self.eos_id = eos_id + self.bos_id = bos_id + self.pad_id = pad_id + + def process_text_by_id(self, index: int) -> Tuple[List[int], int]: + sample = self.collection[index] + return self.process_text_by_sample(sample) + + def process_text_by_file_id(self, file_id: str) -> Tuple[List[int], int]: + manifest_idx = self.collection.mapping[file_id][0] + sample = self.collection[manifest_idx] + return self.process_text_by_sample(sample) + + def process_text_by_sample(self, sample: collections.ASR_AV_AudioText.OUTPUT_TYPE) -> Tuple[List[int], int]: + t, tl = sample.text_tokens, len(sample.text_tokens) + + if self.bos_id is not None: + t = [self.bos_id] + t + tl += 1 + if self.eos_id is not None: + t = t + [self.eos_id] + tl += 1 + + return t, tl + + +def cache_datastore_manifests( + manifest_filepaths: Union[str, List[str]], + cache_audio: bool = False, + shared_cache: Optional[bool] = None, + num_workers: Optional[int] = None, + max_num_workers: int = 20, +): + """Cache manifests and audio from an object store. + It is assumed that remote manifests are using relative paths. + + Args: + manifest_filepaths: list of paths to manifest files (list of strings or a string with `,` as separator) + cache_audio: If True, audio from manifest will also be cached + shared_cache: Optional, True if cache is shared across all nodes + num_workers: Optional, number of workers to be used for download + max_num_workers: max number of workers to be used for download, used when setting num_workers automatically + """ + if isinstance(manifest_filepaths, str): + manifest_filepaths = manifest_filepaths.split(',') + + num_datastore_manifests = sum( + [is_datastore_path(f) for f in manifest_filepaths]) + + if num_datastore_manifests > 0: + # Local utility function + def cache_data(manifest_filepaths, cache_audio, num_workers, max_num_workers): + """Cache manifests and audio data from object store. 
+ """ + # Determine the number of workers to use + if num_workers is None: + num_workers = os.cpu_count() - 1 + num_workers = min(num_workers, max_num_workers) + + # Process each manifest file + for manifest_file in manifest_filepaths: + # If manifest is on a data store, then cache it. + # Otherwise, nothing to do. + if is_datastore_path(manifest_file): + logging.info('Cache manifest file: %s', manifest_file) + cached_manifest_file = DataStoreObject(manifest_file).get() + logging.info('Cached at: %s', str(cached_manifest_file)) + + if cache_audio: + # Each audio file from manifest will be cached. + logging.info( + 'Cache audio from manifest file: %s', manifest_file) + # Assumes that manifest is using relative paths + manifest_dir = os.path.dirname(manifest_file) + # Prepare all store objects + audio_objects = [] + with open(cached_manifest_file, 'r') as f: + for line in f: + item = json.loads(line) + store_path = os.path.join( + manifest_dir, item['audio_filepath']) + audio_objects.append( + DataStoreObject(store_path=store_path)) + + if num_workers is not None and num_workers > 1: + logging.debug( + 'Using multiprocessing with num_workers: %d.', num_workers) + with multiprocessing.Pool(processes=num_workers) as p: + result = list( + tqdm(p.imap(datastore_object_get, audio_objects), total=len( + audio_objects)) + ) + else: + logging.debug('Using a single process.') + result = [] + for audio_object in tqdm(audio_objects): + result.append(audio_object.get() is not None) + + if not all(result): + raise RuntimeError( + 'Some files not downloaded successfully') + logging.info('Caching complete') + + else: + # Nothing to do here + logging.debug( + 'Manifest is not on a data store: %s', manifest_file) + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + logging.debug( + 'Distributed environment is available and initialized.') + + # Handle distributed environment + if shared_cache is None: + shared_cache = is_datastore_cache_shared() + + if shared_cache: + logging.debug( + 'Cache is shared among nodes, cache data on global rank zero.') + is_rank_zero = is_global_rank_zero() + else: + logging.debug( + 'Cache is not shared among nodes, cache data on local rank zero.') + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + is_rank_zero = local_rank == 0 + + if is_rank_zero: + logging.info('Cache data from %s rank 0', + 'global' if shared_cache else 'local') + cache_data( + manifest_filepaths=manifest_filepaths, + cache_audio=cache_audio, + num_workers=num_workers, + max_num_workers=max_num_workers, + ) + logging.debug('Reached barrier') + torch.distributed.barrier() + + elif is_global_rank_zero(): + # Handle non-distributed environment, e.g., if running on a single GPU + logging.warning( + 'Torch distributed is not initialized and caching may be prone to data race conditions. ' + 'Now caching data from global rank 0. If there are other ranks and they pass this ' + 'before rank 0, errors might result.' + ) + cache_data( + manifest_filepaths=manifest_filepaths, + cache_audio=cache_audio, + num_workers=num_workers, + max_num_workers=max_num_workers, + ) + else: + raise RuntimeError( + 'Torch distributed is not initialized and caching on nodes other than global rank zero is disabled ' + 'to avoid race condition between different ranks. To ensure distributed environment is ' + 'initialized, please update data config to use `defer_setup = True`.' 
+ ) + + +class _AVTextDataset(Dataset): + """ + Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). + Each new line is a different sample. Example below: + {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147} + ... + {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt": + "utterance_id", "ctm_utt": "en_4156", "side": "A"} + Args: + manifest_filepath: Path to manifest json as described above. Can be comma-separated paths. + parser: Str for a language specific preprocessor or a callable. + sample_rate (int): Sample rate to resample loaded audio to + int_values (bool): If true, load samples as 32-bit integers. Defauts to False. + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded + audio + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include in dataset + max_utts: Limit number of utterances + trim: whether or not to trim silence. Defaults to False + bos_id: Id of beginning of sequence symbol to append if not None + eos_id: Id of end of sequence symbol to append if not None + pad_id: Id of pad symbol. Defaults to 0 + return_sample_id (bool): whether to return the sample_id as a part of each sample + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. + video_frame_rate (int): Frame rate of video, used to calculate duration of video + """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """Returns definitions of module output ports. 
+        """
+        return {
+            'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
+            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
+            'video_signal': NeuralType(('B', 'F', 'D'), ImageFeatureValue()),
+            'transcripts': NeuralType(('B', 'T'), LabelsType()),
+            'transcript_length': NeuralType(tuple('B'), LengthsType()),
+            'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
+        }
+
+    def __init__(
+        self,
+        manifest_filepath: str,
+        parser: Union[str, Callable],
+        sample_rate: int,
+        int_values: bool = False,
+        augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None,
+        max_duration: Optional[int] = None,
+        min_duration: Optional[int] = None,
+        max_utts: int = 0,
+        trim: bool = False,
+        bos_id: Optional[int] = None,
+        eos_id: Optional[int] = None,
+        pad_id: int = 0,
+        return_sample_id: bool = False,
+        channel_selector: Optional[ChannelSelectorType] = None,
+        video_frame_rate: int = 3,
+    ):
+        if isinstance(manifest_filepath, str):
+            manifest_filepath = manifest_filepath.split(",")
+
+        # If necessary, cache manifests and audio from object store
+        # TODO: @Balu, include cache_video
+        cache_datastore_manifests(manifest_filepaths=manifest_filepath, cache_audio=True)
+
+        self.manifest_processor = ASR_AV_ManifestProcessor(
+            manifest_filepath=manifest_filepath,
+            parser=parser,
+            max_duration=max_duration,
+            min_duration=min_duration,
+            max_utts=max_utts,
+            bos_id=bos_id,
+            eos_id=eos_id,
+            pad_id=pad_id,
+        )
+        self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
+        self.trim = trim
+        self.return_sample_id = return_sample_id
+        self.channel_selector = channel_selector
+        self.video_frame_rate = video_frame_rate
+
+    def get_manifest_sample(self, sample_id):
+        return self.manifest_processor.collection[sample_id]
+
+    def __getitem__(self, index):
+        if isinstance(index, IterableABC):
+            return [self._process_sample(_index) for _index in index]
+        else:
+            return self._process_sample(index)
+
+    def _process_sample(self, index):
+        sample = self.manifest_processor.collection[index]
+        offset = sample.offset
+
+        if offset is None:
+            offset = 0
+
+        features = self.featurizer.process(
+            sample.audio_file,
+            offset=offset,
+            duration=sample.duration,
+            trim=self.trim,
+            orig_sr=sample.orig_sr,
+            channel_selector=self.channel_selector,
+        )
+        f, fl = features, torch.tensor(features.shape[0]).long()
+
+        # Check that the precomputed video feature file exists before loading it.
+        assert os.path.exists(sample.video_featfile), f"Video feature file {sample.video_featfile} does not exist"
+        vf = np.load(sample.video_featfile)
+        # The loaded features are expected to already be uniformly sampled at
+        # `self.video_frame_rate` frames along axis 0.
+        # TODO: @Balu, supporting more than one frame rate would require a separate
+        # feature directory per rate.
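+        # One possible uniform-subsampling sketch (hypothetical; not exercised here,
+        # since features are assumed to be pre-extracted at exactly
+        # `self.video_frame_rate` frames, which the assert below enforces):
+        #     idx = np.linspace(0, vf.shape[0] - 1, num=self.video_frame_rate).astype(int)
+        #     vf = vf[idx]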
+ assert vf.shape[0] == self.video_frame_rate, f"Video feature file {sample.video_featfile} has {vf.shape[0]} frame_feats, expected {self.video_frame_rate}" + + t, tl = self.manifest_processor.process_text_by_sample(sample=sample) + + if self.return_sample_id: + output = f, fl, torch.from_numpy(vf), torch.tensor( + t).long(), torch.tensor(tl).long(), index + else: + output = f, fl, torch.from_numpy(vf), torch.tensor( + t).long(), torch.tensor(tl).long() + + return output + + def __len__(self): + return len(self.manifest_processor.collection) + + def _collate_fn(self, batch): + return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id) + + +class AVToCharDataset(_AVTextDataset): + """ + Dataset that loads tensors via a json file containing paths to audio + files, transcripts, and durations (in seconds). Each new line is a + different sample. Example below: + {"audio_filepath": "/path/to/audio.wav", "text_filepath": + "/path/to/audio.txt", "duration": 23.147} + ... + {"audio_filepath": "/path/to/audio.wav", "text": "the + transcription", "offset": 301.75, "duration": 0.82, "utt": + "utterance_id", "ctm_utt": "en_4156", "side": "A"} + + Args: + manifest_filepath: Path to manifest json as described above. Can + be comma-separated paths. + labels: String containing all the possible characters to map to + sample_rate (int): Sample rate to resample loaded audio to + int_values (bool): If true, load samples as 32-bit integers. Defauts to False. + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor + object used to augment loaded audio + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include + in dataset + max_utts: Limit number of utterances + blank_index: blank character index, default = -1 + unk_index: unk_character index, default = -1 + normalize: whether to normalize transcript text (default): True + bos_id: Id of beginning of sequence symbol to append if not None + eos_id: Id of end of sequence symbol to append if not None + return_sample_id (bool): whether to return the sample_id as a part of each sample + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. + video_frame_rate (int): Frame rate of video, used to calculate duration of video + """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """Returns definitions of module output ports. 
+ """ + return { + 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), + 'a_sig_length': NeuralType(tuple('B'), LengthsType()), + 'video_signal': NeuralType(('B', 'F', 'D'), ImageFeatureValue()), + 'transcripts': NeuralType(('B', 'T'), LabelsType()), + 'transcript_length': NeuralType(tuple('B'), LengthsType()), + 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + } + + def __init__( + self, + manifest_filepath: str, + labels: Union[str, List[str]], + sample_rate: int, + int_values: bool = False, + augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, + max_duration: Optional[float] = None, + min_duration: Optional[float] = None, + max_utts: int = 0, + blank_index: int = -1, + unk_index: int = -1, + normalize: bool = True, + trim: bool = False, + bos_id: Optional[int] = None, + eos_id: Optional[int] = None, + pad_id: int = 0, + parser: Union[str, Callable] = 'en', + return_sample_id: bool = False, + channel_selector: Optional[ChannelSelectorType] = None, + video_frame_rate: int = 3, + ): + self.labels = labels + + parser = parsers.make_parser( + labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize + ) + + super().__init__( + manifest_filepath=manifest_filepath, + parser=parser, + sample_rate=sample_rate, + int_values=int_values, + augmentor=augmentor, + max_duration=max_duration, + min_duration=min_duration, + max_utts=max_utts, + trim=trim, + bos_id=bos_id, + eos_id=eos_id, + pad_id=pad_id, + return_sample_id=return_sample_id, + channel_selector=channel_selector, + video_frame_rate=video_frame_rate, + ) + + +class AudioToBPEDataset(_AVTextDataset): + """ + Dataset that loads tensors via a json file containing paths to audio + files, transcripts, and durations (in seconds). Each new line is a + different sample. Example below: + {"audio_filepath": "/path/to/audio.wav", "text_filepath": + "/path/to/audio.txt", "duration": 23.147} + ... + {"audio_filepath": "/path/to/audio.wav", "text": "the + transcription", "offset": 301.75, "duration": 0.82, "utt": + "utterance_id", "ctm_utt": "en_4156", "side": "A"} + + In practice, the dataset and manifest used for character encoding and byte pair encoding + are exactly the same. The only difference lies in how the dataset tokenizes the text in + the manifest. + + Args: + manifest_filepath: Path to manifest json as described above. Can + be comma-separated paths. + tokenizer: A subclass of the Tokenizer wrapper found in the common collection, + nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of + all available tokenizers. + sample_rate (int): Sample rate to resample loaded audio to + int_values (bool): If true, load samples as 32-bit integers. Defauts to False. + augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor + object used to augment loaded audio + max_duration: If audio exceeds this length, do not include in dataset + min_duration: If audio is less than this length, do not include + in dataset + max_utts: Limit number of utterances + trim: Whether to trim silence segments + use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS] + tokens to beginning and ending of speech respectively. + return_sample_id (bool): whether to return the sample_id as a part of each sample + channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. 
Uses zero-based indexing. + """ + + @property + def output_types(self) -> Optional[Dict[str, NeuralType]]: + """Returns definitions of module output ports. + """ + return { + 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), + 'a_sig_length': NeuralType(tuple('B'), LengthsType()), + 'transcripts': NeuralType(('B', 'T'), LabelsType()), + 'transcript_length': NeuralType(tuple('B'), LengthsType()), + 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + } + + def __init__( + self, + manifest_filepath: str, + tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec', + sample_rate: int, + int_values: bool = False, + augmentor: 'nemo.collections.asr.parts.perturb.AudioAugmentor' = None, + max_duration: Optional[int] = None, + min_duration: Optional[int] = None, + max_utts: int = 0, + trim: bool = False, + use_start_end_token: bool = True, + return_sample_id: bool = False, + channel_selector: Optional[ChannelSelectorType] = None, + ): + if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: + bos_id = tokenizer.bos_id + else: + bos_id = None + + if use_start_end_token and hasattr(tokenizer, "eos_id") and tokenizer.eos_id > 0: + eos_id = tokenizer.eos_id + else: + eos_id = None + + if hasattr(tokenizer, "pad_id") and tokenizer.pad_id > 0: + pad_id = tokenizer.pad_id + else: + pad_id = 0 + + class TokenizerWrapper: + def __init__(self, tokenizer): + if isinstance(tokenizer, tokenizers.aggregate_tokenizer.AggregateTokenizer): + self.is_aggregate = True + else: + self.is_aggregate = False + self._tokenizer = tokenizer + + def __call__(self, *args): + if isinstance(args[0], List) and self.is_aggregate: + t = [] + for span in args[0]: + t.extend(self._tokenizer.text_to_ids( + span['str'], span['lang'])) + return t + + t = self._tokenizer.text_to_ids(*args) + return t + + super().__init__( + manifest_filepath=manifest_filepath, + parser=TokenizerWrapper(tokenizer), + sample_rate=sample_rate, + int_values=int_values, + augmentor=augmentor, + max_duration=max_duration, + min_duration=min_duration, + max_utts=max_utts, + bos_id=bos_id, + eos_id=eos_id, + pad_id=pad_id, + trim=trim, + return_sample_id=return_sample_id, + channel_selector=channel_selector, + ) diff --git a/nemo/collections/asr/models/av_ctc_bpe_models.py b/nemo/collections/asr/models/av_ctc_bpe_models.py new file mode 100644 index 000000000000..7544ce50e7fe --- /dev/null +++ b/nemo/collections/asr/models/av_ctc_bpe_models.py @@ -0,0 +1,656 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
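+
+# BPE (subword) variant of the audio-visual CTC model. In this base-file commit the
+# class still mirrors `ctc_bpe_models.EncDecCTCModelBPE` and uses the audio-only BPE
+# dataset; the AV-specific data path lives in `av_to_text.py` and `av_ctc_models.py`.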
+ +import copy +import os +from typing import Dict, List, Optional, Union + +import torch +from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset +from nemo.collections.asr.losses.ctc import CTCLoss +from nemo.collections.asr.metrics.wer import WER +from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.parts.mixins import ASRBPEMixin +from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.core.classes.common import PretrainedModelInfo +from nemo.utils import logging, model_utils + +__all__ = ['EncDecCTCModelBPE'] + + +class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin): + """Encoder decoder CTC-based models with Byte Pair Encoding.""" + + def __init__(self, cfg: DictConfig, trainer=None): + # Convert to Hydra 1.0 compatible DictConfig + cfg = model_utils.convert_model_config_to_dict_config(cfg) + cfg = model_utils.maybe_update_config_version(cfg) + + if 'tokenizer' not in cfg: + raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !") + + # Setup the tokenizer + self._setup_tokenizer(cfg.tokenizer) + + # Initialize a dummy vocabulary + vocabulary = self.tokenizer.tokenizer.get_vocab() + + # Set the new vocabulary + with open_dict(cfg): + # sidestepping the potential overlapping tokens issue in aggregate tokenizers + if self.tokenizer_type == "agg": + cfg.decoder.vocabulary = ListConfig(vocabulary) + else: + cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys())) + + # Override number of classes if placeholder provided + num_classes = cfg.decoder["num_classes"] + + if num_classes < 1: + logging.info( + "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( + num_classes, len(vocabulary) + ) + ) + cfg.decoder["num_classes"] = len(vocabulary) + + super().__init__(cfg=cfg, trainer=trainer) + + # Setup decoding objects + decoding_cfg = self.cfg.get('decoding', None) + + # In case decoding config not found, use default config + if decoding_cfg is None: + decoding_cfg = OmegaConf.structured(CTCBPEDecodingConfig) + with open_dict(self.cfg): + self.cfg.decoding = decoding_cfg + + self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) + + # Setup metric with decoding strategy + self.wer = WER( + decoding=self.decoding, + use_cer=self._cfg.get('use_cer', False), + dist_sync_on_step=True, + log_prediction=self._cfg.get("log_prediction", False), + ) + + def _setup_dataloader_from_config(self, config: Optional[Dict]): + # if config.get("use_lhotse"): + # return get_lhotse_dataloader_from_config( + # config, + # global_rank=self.global_rank, + # world_size=self.world_size, + # dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer), + # ) + + dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + config=config, + local_rank=self.local_rank, + global_rank=self.global_rank, + world_size=self.world_size, + tokenizer=self.tokenizer, + preprocessor_cfg=self.cfg.get("preprocessor", None), + ) + + if dataset is None: + return None + + if isinstance(dataset, AudioToBPEDALIDataset): + # DALI Dataset implements dataloader interface + return dataset + + shuffle = config['shuffle'] + if 
isinstance(dataset, torch.utils.data.IterableDataset): + shuffle = False + + if hasattr(dataset, 'collate_fn'): + collate_fn = dataset.collate_fn + elif hasattr(dataset.datasets[0], 'collate_fn'): + # support datasets that are lists of entries + collate_fn = dataset.datasets[0].collate_fn + else: + # support datasets that are lists of lists + collate_fn = dataset.datasets[0].datasets[0].collate_fn + + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + + return torch.utils.data.DataLoader( + dataset=dataset, + batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, + collate_fn=collate_fn, + drop_last=config.get('drop_last', False), + shuffle=shuffle, + num_workers=config.get('num_workers', 0), + pin_memory=config.get('pin_memory', False), + ) + + def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader': + """ + Setup function for a temporary data loader which wraps the provided audio file. + + Args: + config: A python dictionary which contains the following keys: + paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \ + Recommended length per file is between 5 and 25 seconds. + batch_size: (int) batch size to use during inference. \ + Bigger will result in better throughput performance but would use more memory. + temp_dir: (str) A temporary directory where the audio manifest is temporarily + stored. + num_workers: (int) number of workers. Depends of the batch_size and machine. \ + 0 - only the main process will load batches, 1 - one worker (not main process) + + Returns: + A pytorch DataLoader for the given audio file(s). + """ + + if 'manifest_filepath' in config: + manifest_filepath = config['manifest_filepath'] + batch_size = config['batch_size'] + else: + manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json') + batch_size = min(config['batch_size'], len(config['paths2audio_files'])) + + dl_config = { + 'manifest_filepath': manifest_filepath, + 'sample_rate': self.preprocessor._sample_rate, + 'batch_size': batch_size, + 'shuffle': False, + 'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)), + 'pin_memory': True, + 'channel_selector': config.get('channel_selector', None), + 'use_start_end_token': self.cfg.validation_ds.get('use_start_end_token', False), + } + + if config.get("augmentor"): + dl_config['augmentor'] = config.get("augmentor") + + temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config)) + return temporary_datalayer + + def change_vocabulary( + self, + new_tokenizer_dir: Union[str, DictConfig], + new_tokenizer_type: str, + decoding_cfg: Optional[DictConfig] = None, + ): + """ + Changes vocabulary of the tokenizer used during CTC decoding process. + Use this method when fine-tuning on from pre-trained model. + This method changes only decoder and leaves encoder and pre-processing modules unchanged. 
For example, you would
+        use it if you want to use a pretrained encoder when fine-tuning on data in another language, or when you need the
+        model to learn capitalization, punctuation and/or special characters.
+
+        Args:
+            new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`)
+            new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
+                whereas `wpe` is used for `BertTokenizer`.
+            decoding_cfg: A config for the decoding strategy; if not provided, the existing decoding config is reused.
+
+        Returns: None
+
+        """
+        if isinstance(new_tokenizer_dir, DictConfig):
+            if new_tokenizer_type == 'agg':
+                new_tokenizer_cfg = new_tokenizer_dir
+            else:
+                raise ValueError(
+                    f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}'
+                )
+        else:
+            new_tokenizer_cfg = None
+
+        if new_tokenizer_cfg is not None:
+            tokenizer_cfg = new_tokenizer_cfg
+        else:
+            if not os.path.isdir(new_tokenizer_dir):
+                raise NotADirectoryError(
+                    f'New tokenizer dir must be non-empty path to a directory. But I got: {new_tokenizer_dir}'
+                )
+
+            if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
+                raise ValueError('New tokenizer type must be either `bpe` or `wpe`')
+
+            tokenizer_cfg = OmegaConf.create({'dir': new_tokenizer_dir, 'type': new_tokenizer_type})
+
+        # Setup the tokenizer
+        self._setup_tokenizer(tokenizer_cfg)
+
+        # Initialize a dummy vocabulary
+        vocabulary = self.tokenizer.tokenizer.get_vocab()
+
+        # Set the new vocabulary
+        decoder_config = copy.deepcopy(self.decoder.to_config_dict())
+        # sidestepping the potential overlapping tokens issue in aggregate tokenizers
+        if self.tokenizer_type == "agg":
+            decoder_config.vocabulary = ListConfig(vocabulary)
+        else:
+            decoder_config.vocabulary = ListConfig(list(vocabulary.keys()))
+
+        decoder_num_classes = decoder_config['num_classes']
+
+        # Override number of classes if placeholder provided
+        logging.info(
+            "\nReplacing old number of classes ({}) with new number of classes - {}".format(
+                decoder_num_classes, len(vocabulary)
+            )
+        )
+
+        decoder_config['num_classes'] = len(vocabulary)
+
+        del self.decoder
+        self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
+        del self.loss
+        self.loss = CTCLoss(
+            num_classes=self.decoder.num_classes_with_blank - 1,
+            zero_infinity=True,
+            reduction=self._cfg.get("ctc_reduction", "mean_batch"),
+        )
+
+        if decoding_cfg is None:
+            # Assume same decoding config as before
+            decoding_cfg = self.cfg.decoding
+
+        # Assert the decoding config with all hyper parameters
+        decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig)
+        decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls))
+        decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg)
+
+        self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer)
+
+        self.wer = WER(
+            decoding=self.decoding,
+            use_cer=self._cfg.get('use_cer', False),
+            log_prediction=self._cfg.get("log_prediction", False),
+            dist_sync_on_step=True,
+        )
+
+        # Update config
+        with open_dict(self.cfg.decoder):
+            self._cfg.decoder = decoder_config
+
+        with open_dict(self.cfg.decoding):
+            self._cfg.decoding = decoding_cfg
+
+        logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")
+
+    def change_decoding_strategy(self, decoding_cfg: DictConfig):
+        """
+        Changes decoding strategy used during CTC decoding process.
+ + Args: + decoding_cfg: A config for the decoder, which is optional. If the decoding type + needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here. + """ + if decoding_cfg is None: + # Assume same decoding config as before + logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config") + decoding_cfg = self.cfg.decoding + + # Assert the decoding config with all hyper parameters + decoding_cls = OmegaConf.structured(CTCBPEDecodingConfig) + decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls)) + decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg) + + self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) + + self.wer = WER( + decoding=self.decoding, + use_cer=self.wer.use_cer, + log_prediction=self.wer.log_prediction, + dist_sync_on_step=True, + ) + + self.decoder.temperature = decoding_cfg.get('temperature', 1.0) + + # Update config + with open_dict(self.cfg.decoding): + self.cfg.decoding = decoding_cfg + + logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}") + + @classmethod + def list_available_models(cls) -> List[PretrainedModelInfo]: + """ + This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud. + + Returns: + List of available pre-trained models. + """ + results = [] + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_256", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256/versions/1.0.0rc1/files/stt_en_citrinet_256.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_512", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512/versions/1.0.0rc1/files/stt_en_citrinet_512.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_1024", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_256_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_512_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_citrinet_1024_gamma_0_25", + description="For details about this model, please visit 
https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_citrinet_512", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_512/versions/1.0.0/files/stt_es_citrinet_512.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_de_citrinet_1024", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.5.0/files/stt_de_citrinet_1024.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_citrinet_1024_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_no_hyphen_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_no_hyphen_citrinet_1024_gamma_0_25.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_citrinet_1024_gamma_0_25", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_1024_gamma_0_25/versions/1.8.0/files/stt_es_citrinet_1024_gamma_0_25.nemo", + ) + + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_small", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.6.0/files/stt_en_conformer_ctc_small.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.6.0/files/stt_en_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.10.0/files/stt_en_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + 
pretrained_model_name="stt_en_conformer_ctc_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_xlarge/versions/1.10.0/files/stt_en_conformer_ctc_xlarge.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_xsmall_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_xsmall_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_xsmall_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_xsmall_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_small_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_small_medium_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_medium_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_medium_large_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_squeezeformer_ctc_large_ls", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_small_ls", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small_ls/versions/1.0.0/files/stt_en_conformer_ctc_small_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_medium_ls", + 
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium_ls/versions/1.0.0/files/stt_en_conformer_ctc_medium_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_conformer_ctc_large_ls", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large_ls/versions/1.0.0/files/stt_en_conformer_ctc_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_conformer_ctc_large", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_fr_no_hyphen_conformer_ctc_large", + description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_no_hyphen_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_de_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_ctc_large/versions/1.5.0/files/stt_de_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_es_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_conformer_ctc_large/versions/1.8.0/files/stt_es_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_hi_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hi_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hi_conformer_ctc_medium/versions/1.6.0/files/stt_hi_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_mr_conformer_ctc_medium", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_mr_conformer_ctc_medium", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_mr_conformer_ctc_medium/versions/1.6.0/files/stt_mr_conformer_ctc_medium.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_enes_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large/versions/1.0.0/files/stt_enes_conformer_ctc_large.nemo", + ) + results.append(model) + + 
model = PretrainedModelInfo( + pretrained_model_name="stt_ca_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_conformer_ctc_large/versions/1.11.0/files/stt_ca_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_rw_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_rw_conformer_ctc_large/versions/1.11.0/files/stt_rw_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_enes_conformer_ctc_large_codesw", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large_codesw", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large_codesw/versions/1.0.0/files/stt_enes_conformer_ctc_large_codesw.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_be_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_be_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_be_conformer_ctc_large/versions/1.12.0/files/stt_be_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_hr_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_ctc_large/versions/1.11.0/files/stt_hr_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_it_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_conformer_ctc_large/versions/1.13.0/files/stt_it_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_ru_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_conformer_ctc_large/versions/1.13.0/files/stt_ru_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_eo_conformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_eo_conformer_ctc_large", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_eo_conformer_ctc_large/versions/1.14.0/files/stt_eo_conformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_large", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large", + 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large/versions/1.0.0/files/stt_en_fastconformer_ctc_large.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_large_ls", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large_ls", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large_ls/versions/1.0.0/files/stt_en_fastconformer_ctc_large_ls.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xlarge/versions/1.20.0/files/stt_en_fastconformer_ctc_xlarge.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_xxlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xxlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xxlarge/versions/1.20.1/files/stt_en_fastconformer_ctc_xxlarge.nemo", + ) + results.append(model) + + return results diff --git a/nemo/collections/asr/models/av_ctc_models.py b/nemo/collections/asr/models/av_ctc_models.py new file mode 100644 index 000000000000..22f3a43f3b23 --- /dev/null +++ b/nemo/collections/asr/models/av_ctc_models.py @@ -0,0 +1,876 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
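+
+# Editorial note (illustrative sketch, not part of the upstream API): the constructor
+# of EncDecCTCModel below reads a composite config. The key groups listed here are the
+# ones referenced in this file; the exact shapes and example values are assumptions:
+#
+#   cfg.a_model_name   -- name of a pretrained audio CTC model to restore (e.g. "stt_en_conformer_ctc_large")
+#   cfg.a_model        -- config forwarded to the ASR base class constructor
+#   cfg.v_model.decoder._feat_in                          -- dimensionality of the pre-extracted video features
+#   cfg.av_encoder.{d_model, nhead, dropout, num_layers}  -- fusion transformer encoder
+#   cfg.av_decoder.d_model                                -- size used for the modality and positional embeddings
+#   cfg.decoder        -- CTC decoder config (feat_in, num_classes, vocabulary)
+#   cfg.decoding       -- optional CTCDecodingConfig overrides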
+import copy +import json +import os +import tempfile +from math import ceil +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +from omegaconf import DictConfig, OmegaConf, open_dict +from pytorch_lightning import Trainer +from tqdm.auto import tqdm + +from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.av_to_text import _AVTextDataset +# from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs +# from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset +from nemo.collections.asr.losses.ctc import CTCLoss +from nemo.collections.asr.metrics.wer import WER +from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel +from nemo.collections.asr.parts.mixins import ASRModuleMixin, ASRTranscriptionMixin, InterCTCMixin, TranscribeConfig +from nemo.collections.asr.parts.mixins.transcription import GenericTranscriptionType, TranscriptionReturnType +from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler +from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType +# from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.collections.common.parts.preprocessing.parsers import make_parser +from nemo.core.classes.common import PretrainedModelInfo, typecheck +from nemo.core.classes.mixins import AccessMixin +from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType, ImageFeatureValue +from nemo.utils import logging + +__all__ = ['EncDecCTCModel'] + + +class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, InterCTCMixin, ASRTranscriptionMixin): + """Base class for encoder decoder CTC-based models.""" + + def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # Get global rank and total number of GPU workers for IterableDataset partitioning, if applicable + # Global_rank and local_rank is set by LightningModule in Lightning 1.2.0 + self.world_size = 1 + if trainer is not None: + self.world_size = trainer.world_size + + super().__init__(cfg=cfg.a_model, trainer=trainer) + + self.a_model = EncDecCTCModel.from_pretrained(cfg.a_model_name) + with open_dict(self._cfg): + if "feat_in" not in self._cfg.decoder or ( + not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out') + ): + self._cfg.decoder.feat_in = self.encoder._feat_out + if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in: + raise ValueError("param feat_in of the decoder's config is not set!") + + if self.cfg.decoder.num_classes < 1 and self.cfg.decoder.vocabulary is not None: + logging.info( + "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format( + self.cfg.decoder.num_classes, len(self.cfg.decoder.vocabulary) + ) + ) + cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) + + # initialize a transformer encoder and decoder + self.a_linear = torch.nn.Linear(in_features = self.a_model.decoder._feat_in, out_features = self.cfg.av_encoder.d_model) + self.v_linear = torch.nn.Linear(in_features = self.cfg.v_model.decoder._feat_in, out_features = self.cfg.av_encoder.d_model) + self.av_enocder_layer = torch.nn.TransformerEncoderLayer(d_model = self.cfg.av_encoder.d_model, nhead = self.cfg.av_encoder.nhead, dropout = self.cfg.av_encoder.dropout, 
batch_first=True)
+        self.av_encoder = torch.nn.TransformerEncoder(self.av_enocder_layer, num_layers = self.cfg.av_encoder.num_layers)
+
+        # Modality embeddings
+        self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_decoder.d_model)
+        self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_decoder.d_model)
+
+        # Trainable positional encodings
+        self.a_pos_enc = torch.nn.Embedding(10000, self.cfg.av_decoder.d_model)
+        self.v_pos_enc = torch.nn.Embedding(10000, self.cfg.av_decoder.d_model)
+
+        # self.av_decoder_layer = torch.nn.TransformerDecoderLayer(d_model = self.cfg.av_decoder.d_model, nhead = self.cfg.av_decoder.nhead, dropout = self.cfg.av_decoder.dropout, batch_first=True)
+        # self.av_decoder = torch.nn.TransformerDecoder(self.av_decoder_layer, num_layers = self.cfg.av_decoder.num_layers)
+        # self.av_linear = torch.nn.Linear(in_features = self.cfg.av_decoder.d_model, out_features = len(self.a_model.decoder.vocabulary))
+
+        self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder)
+        self.loss = CTCLoss(
+            num_classes=self.decoder.num_classes_with_blank - 1,
+            zero_infinity=True,
+            reduction=self._cfg.get("ctc_reduction", "mean_batch"),
+        )
+
+        # Setup decoding objects (this assignment must stay active; the check below relies on it)
+        decoding_cfg = self.cfg.get('decoding', None)
+
+        # In case decoding config not found, use default config
+        if decoding_cfg is None:
+            decoding_cfg = OmegaConf.structured(CTCDecodingConfig)
+            with open_dict(self.cfg):
+                self.cfg.decoding = decoding_cfg
+
+        self.decoding = CTCDecoding(self.cfg.decoding, vocabulary=OmegaConf.to_container(self.decoder.vocabulary))
+
+        # Setup metric with decoding strategy
+        self.wer = WER(
+            decoding=self.decoding,
+            use_cer=self._cfg.get('use_cer', False),
+            dist_sync_on_step=True,
+            log_prediction=self._cfg.get("log_prediction", False),
+        )
+
+        # Setup optional Optimization flags
+        self.setup_optimization_flags()
+
+        # setting up interCTC loss (from InterCTCMixin)
+        self.setup_interctc(decoder_name='decoder', loss_name='loss', wer_name='wer')
+
+    def transcribe(
+        self,
+        audio: Union[str, List[str], torch.Tensor, np.ndarray],
+        batch_size: int = 4,
+        return_hypotheses: bool = False,
+        num_workers: int = 0,
+        channel_selector: Optional[ChannelSelectorType] = None,
+        augmentor: DictConfig = None,
+        verbose: bool = True,
+        override_config: Optional[TranscribeConfig] = None,
+    ) -> TranscriptionReturnType:
+        """
+        If you modify this function, please remember to update transcribe_partial_audio() in
+        nemo/collections/asr/parts/utils/transcribe_utils.py
+
+        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.
+
+        Args:
+            audio: (a single or list) of paths to audio files or a np.ndarray audio array. \
+                Recommended length per file is between 5 and 25 seconds. \
+                But it is possible to pass a few hours long file if enough GPU memory is available.
+            batch_size: (int) batch size to use during inference.
+                Bigger will result in better throughput performance but would use more memory.
+            return_hypotheses: (bool) Either return hypotheses or text
+                With hypotheses can do some postprocessing like getting timestamp or rescoring
+            num_workers: (int) number of workers for DataLoader
+            channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`.
+            augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied.
+            verbose: (bool) whether to display tqdm progress bar
+            override_config: (Optional[TranscribeConfig]) override transcription config pre-defined by the user.
+                **Note**: All other arguments in the function will be ignored if override_config is passed.
+                You should call this argument as `model.transcribe(audio, override_config=TranscribeConfig(...))`.
+
+        Returns:
+            A list of transcriptions (or full hypotheses, if `return_hypotheses` is True) in the same order
+            as the input audio.
+        """
+        return super().transcribe(
+            audio=audio,
+            batch_size=batch_size,
+            return_hypotheses=return_hypotheses,
+            num_workers=num_workers,
+            channel_selector=channel_selector,
+            augmentor=augmentor,
+            verbose=verbose,
+            override_config=override_config,
+        )
+
+    def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[DictConfig] = None):
+        """
+        Changes vocabulary used during CTC decoding process. Use this method when fine-tuning from a pre-trained model.
+        This method changes only the decoder and leaves the encoder and pre-processing modules unchanged. For example,
+        you would use it if you want to use a pretrained encoder when fine-tuning on data in another language, or when
+        you'd need the model to learn capitalization, punctuation and/or special characters.
+
+        If new_vocabulary == self.decoder.vocabulary then nothing will be changed.
+
+        Args:
+
+            new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \
+                this is the target alphabet.
+
+        Returns: None
+
+        """
+        if self.decoder.vocabulary == new_vocabulary:
+            logging.warning(f"Old {self.decoder.vocabulary} and new {new_vocabulary} match. Not changing anything.")
+        else:
+            if new_vocabulary is None or len(new_vocabulary) == 0:
+                raise ValueError(f'New vocabulary must be a non-empty list of chars. But I got: {new_vocabulary}')
+            decoder_config = self.decoder.to_config_dict()
+            new_decoder_config = copy.deepcopy(decoder_config)
+            new_decoder_config['vocabulary'] = new_vocabulary
+            new_decoder_config['num_classes'] = len(new_vocabulary)
+
+            del self.decoder
+            self.decoder = EncDecCTCModel.from_config_dict(new_decoder_config)
+            del self.loss
+            self.loss = CTCLoss(
+                num_classes=self.decoder.num_classes_with_blank - 1,
+                zero_infinity=True,
+                reduction=self._cfg.get("ctc_reduction", "mean_batch"),
+            )
+
+            if decoding_cfg is None:
+                # Assume same decoding config as before
+                decoding_cfg = self.cfg.decoding
+
+            # Assert the decoding config with all hyper parameters
+            decoding_cls = OmegaConf.structured(CTCDecodingConfig)
+            decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls))
+            decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg)
+
+            self.decoding = CTCDecoding(
+                decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary)
+            )
+
+            self.wer = WER(
+                decoding=self.decoding,
+                use_cer=self._cfg.get('use_cer', False),
+                dist_sync_on_step=True,
+                log_prediction=self._cfg.get("log_prediction", False),
+            )
+
+            # Update config
+            with open_dict(self.cfg.decoder):
+                self._cfg.decoder = new_decoder_config
+
+            with open_dict(self.cfg.decoding):
+                self.cfg.decoding = decoding_cfg
+
+            ds_keys = ['train_ds', 'validation_ds', 'test_ds']
+            for key in ds_keys:
+                if key in self.cfg:
+                    with open_dict(self.cfg[key]):
+                        self.cfg[key]['labels'] = OmegaConf.create(new_vocabulary)
+
+            logging.info(f"Changed decoder to output to {self.decoder.vocabulary} vocabulary.")
+
+    def change_decoding_strategy(self, decoding_cfg: DictConfig):
+        """
+        Changes decoding strategy used during CTC decoding process.
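+
+        Example (a minimal sketch; it assumes the standard ``strategy`` field of
+        ``CTCDecodingConfig``)::
+
+            from omegaconf import OmegaConf
+
+            model.change_decoding_strategy(OmegaConf.create({'strategy': 'greedy'}))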
+
+        Args:
+            decoding_cfg: An optional config for the decoding strategy. If the decoding type
+                needs to be changed (say from greedy to beam decoding), the config can be passed here.
+        """
+        if decoding_cfg is None:
+            # Assume same decoding config as before
+            logging.info("No `decoding_cfg` passed when changing decoding strategy, using internal config")
+            decoding_cfg = self.cfg.decoding
+
+        # Assert the decoding config with all hyper parameters
+        decoding_cls = OmegaConf.structured(CTCDecodingConfig)
+        decoding_cls = OmegaConf.create(OmegaConf.to_container(decoding_cls))
+        decoding_cfg = OmegaConf.merge(decoding_cls, decoding_cfg)
+
+        self.decoding = CTCDecoding(
+            decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary)
+        )
+
+        self.wer = WER(
+            decoding=self.decoding,
+            use_cer=self.wer.use_cer,
+            log_prediction=self.wer.log_prediction,
+            dist_sync_on_step=True,
+        )
+
+        self.decoder.temperature = decoding_cfg.get('temperature', 1.0)
+
+        # Update config
+        with open_dict(self.cfg.decoding):
+            self.cfg.decoding = decoding_cfg
+
+        logging.info(f"Changed decoding strategy to \n{OmegaConf.to_yaml(self.cfg.decoding)}")
+
+    def _setup_dataloader_from_config(self, config: Optional[Dict]):
+        # Automatically inject args from model config to dataloader config
+        audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='sample_rate')
+        audio_to_text_dataset.inject_dataloader_value_from_model_config(self.cfg, config, key='labels')
+
+        dataset = audio_to_text_dataset.get_av_to_text_char_dataset_from_config(
+            config=config,
+            local_rank=self.local_rank,
+            global_rank=self.global_rank,
+            world_size=self.world_size,
+            preprocessor_cfg=self._cfg.get("preprocessor", None),
+        )
+
+        if dataset is None:
+            return None
+
+        shuffle = config['shuffle']
+        if isinstance(dataset, torch.utils.data.IterableDataset):
+            shuffle = False
+
+        if hasattr(dataset, 'collate_fn'):
+            collate_fn = dataset.collate_fn
+        elif hasattr(dataset.datasets[0], 'collate_fn'):
+            # support datasets that are lists of entries
+            collate_fn = dataset.datasets[0].collate_fn
+        else:
+            # support datasets that are lists of lists
+            collate_fn = dataset.datasets[0].datasets[0].collate_fn
+
+        batch_sampler = None
+        if config.get('use_semi_sorted_batching', False):  # semi-sorted batching is also usable for the AV dataset
+            if not isinstance(dataset, _AVTextDataset):
+                raise RuntimeError(
+                    "Semi Sorted Batch sampler can only be used with an _AVTextDataset "
+                    f"but found dataset of type {type(dataset)}"
+                )
+            # set batch_size and batch_sampler to None to disable automatic batching
+            batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config)
+            config['batch_size'] = None
+            config['drop_last'] = False
+            shuffle = False
+
+        return torch.utils.data.DataLoader(
+            dataset=dataset,
+            batch_size=config['batch_size'],
+            sampler=batch_sampler,
+            batch_sampler=None,
+            collate_fn=collate_fn,
+            drop_last=config.get('drop_last', False),
+            shuffle=shuffle,
+            num_workers=config.get('num_workers', 0),
+            pin_memory=config.get('pin_memory', False),
+        )
+
+    def setup_training_data(self, train_data_config: Optional[Union[DictConfig, Dict]]):
+        """
+        Sets up the training data loader via a Dict-like object.
+
+        Args:
+            train_data_config: A config that contains the information regarding construction
+                of an ASR Training dataset.
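+
+                A minimal sketch of a training config (the keys shown are the ones
+                read by this model's dataloader setup; the path and values are
+                placeholders):
+
+                .. code-block:: python
+
+                    train_data_config = {
+                        'manifest_filepath': '/path/to/av_train_manifest.json',
+                        'sample_rate': 16000,
+                        'batch_size': 16,
+                        'shuffle': True,
+                        'video_frame_rate': 3,
+                    }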
+
+        Supported Datasets:
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
+        """
+        if 'shuffle' not in train_data_config:
+            train_data_config['shuffle'] = True
+
+        # preserve config
+        self._update_dataset_config(dataset_name='train', config=train_data_config)
+
+        self._train_dl = self._setup_dataloader_from_config(config=train_data_config)
+
+        # Need to set this because if using an IterableDataset, the length of the dataloader is the total number
+        # of samples rather than the number of batches, and this messes up the tqdm progress bar.
+        # So we set the number of steps manually (to the correct number) to fix this.
+        if (
+            self._train_dl is not None
+            and hasattr(self._train_dl, 'dataset')
+            and isinstance(self._train_dl.dataset, torch.utils.data.IterableDataset)
+        ):
+            # We also need to check if limit_train_batches is already set.
+            # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches,
+            # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0).
+            if self._trainer is not None and isinstance(self._trainer.limit_train_batches, float):
+                self._trainer.limit_train_batches = int(
+                    self._trainer.limit_train_batches
+                    * ceil((len(self._train_dl.dataset) / self.world_size) / train_data_config['batch_size'])
+                )
+            elif self._trainer is None:
+                logging.warning(
+                    "Model Trainer was not set before constructing the dataset, incorrect number of "
+                    "training batches will be used. Please set the trainer and rebuild the dataset."
+                )
+
+    def setup_validation_data(self, val_data_config: Optional[Union[DictConfig, Dict]]):
+        """
+        Sets up the validation data loader via a Dict-like object.
+
+        Args:
+            val_data_config: A config that contains the information regarding construction
+                of an ASR validation dataset.
+
+        Supported Datasets:
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
+        """
+        if 'shuffle' not in val_data_config:
+            val_data_config['shuffle'] = False
+
+        # preserve config
+        self._update_dataset_config(dataset_name='validation', config=val_data_config)
+
+        self._validation_dl = self._setup_dataloader_from_config(config=val_data_config)
+
+    def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]):
+        """
+        Sets up the test data loader via a Dict-like object.
+
+        Args:
+            test_data_config: A config that contains the information regarding construction
+                of an ASR test dataset.
+
+        Supported Datasets:
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.AudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text.TarredAudioToBPEDataset`
+            - :class:`~nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset`
+        """
+        if 'shuffle' not in test_data_config:
+            test_data_config['shuffle'] = False
+
+        # preserve config
+        self._update_dataset_config(dataset_name='test', config=test_data_config)
+
+        self._test_dl = self._setup_dataloader_from_config(config=test_data_config)
+
+    @property
+    def input_types(self) -> Optional[Dict[str, NeuralType]]:
+        if hasattr(self.preprocessor, '_sample_rate'):
+            input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate)
+        else:
+            input_signal_eltype = AudioSignal()
+        return {
+            # NOTE: these keys must match the keyword names of ``forward`` for @typecheck to accept them
+            "audio_input": NeuralType(('B', 'T'), input_signal_eltype, optional=True),
+            "audio_input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "video_input_signal": NeuralType(('B', 'F', 'D'), ImageFeatureValue(), optional=True),
+            "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True),
+            "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True),
+        }
+
+    @property
+    def output_types(self) -> Optional[Dict[str, NeuralType]]:
+        return {
+            "outputs": NeuralType(('B', 'T', 'D'), LogprobsType()),
+            "encoded_lengths": NeuralType(tuple('B'), LengthsType()),
+            "greedy_predictions": NeuralType(('B', 'T'), LabelsType()),
+        }
+
+    @typecheck()
+    def forward(
+        self, audio_input=None, audio_input_signal_length=None, video_input_signal=None, processed_signal=None, processed_signal_length=None
+    ):
+        """
+        Forward pass of the model.
+
+        Args:
+            audio_input: Tensor that represents a batch of raw audio signals,
+                of shape [B, T]. T here represents timesteps, with 1 second of audio represented as
+                `self.sample_rate` number of floating point values.
+            audio_input_signal_length: Vector of length B, that contains the individual lengths of the audio
+                sequences.
+            video_input_signal: Tensor of pre-extracted video features, of shape [B, F, D], where F is the
+                number of video frames and D is the feature dimensionality.
+            processed_signal: Tensor that represents a batch of processed audio signals,
+                of shape (B, D, T) that has undergone processing via some DALI preprocessor.
+            processed_signal_length: Vector of length B, that contains the individual lengths of the
+                processed audio sequences.
+
+        Returns:
+            A tuple of 3 elements -
+            1) The log probabilities tensor of shape [B, T, D].
+            2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B].
+            3) The greedy token predictions of the model of shape [B, T] (via argmax)
+        """
+        has_input_signal = audio_input is not None and audio_input_signal_length is not None
+        has_processed_signal = processed_signal is not None and processed_signal_length is not None
+        if not (has_input_signal ^ has_processed_signal):
+            raise ValueError(
+                f"{self} Arguments ``audio_input`` and ``audio_input_signal_length`` are mutually exclusive "
+                " with ``processed_signal`` and ``processed_signal_length`` arguments."
+            )
+
+        if not has_processed_signal:
+            # NeMo preprocessors expect the raw waveform under the ``input_signal`` keyword
+            processed_signal, processed_signal_length = self.preprocessor(
+                input_signal=audio_input, length=audio_input_signal_length,
+            )
+
+        if self.spec_augmentation is not None and self.training:
+            processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)
+
+        encoder_output = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
+        encoded = encoder_output[0]
+        encoded_len = encoder_output[1]
+        # NeMo encoders return features as [B, D, T]; transpose to [B, T, D] for the linear/transformer layers
+        a_encoded = self.a_linear(encoded.transpose(1, 2))
+        v_encoded = self.v_linear(video_input_signal)
+
+        # Add modality embeddings (use the embedding weight directly; an nn.Embedding
+        # module itself cannot be added to a tensor)
+        B, T, C = a_encoded.size()
+        B, F, D = v_encoded.size()
+        assert C == D, "The audio and video features must have the same dimensionality"
+        a_encoded = a_encoded + self.a_modal_embs.weight[0]
+        v_encoded = v_encoded + self.v_modal_embs.weight[0]
+
+        # Add trainable positional encodings, truncated to the actual sequence lengths
+        # (assumes T and F do not exceed the 10000 positions allocated above)
+        a_encoded = a_encoded + self.a_pos_enc.weight[:T].unsqueeze(0)
+        v_encoded = v_encoded + self.v_pos_enc.weight[:F].unsqueeze(0)
+
+        # Concat and pass them through the transformer encoder
+        av_encoded = torch.cat((a_encoded, v_encoded), dim=1)
+        av_encoded = self.av_encoder(av_encoded)
+
+        # The convolutional CTC decoder consumes [B, D, T]; transpose back after the transformer encoder
+        log_probs = self.decoder(encoder_output=av_encoded.transpose(1, 2))
+        greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+
+        return (
+            log_probs,
+            encoded_len,
+            greedy_predictions,
+        )
+
+    # PTL-specific methods
+    def training_step(self, batch, batch_nb):
+        # Reset access registry
+        if AccessMixin.is_access_enabled(self.model_guid):
+            AccessMixin.reset_registry(self)
+
+        if self.is_interctc_enabled():
+            AccessMixin.set_access_enabled(access_enabled=True, guid=self.model_guid)
+
+        signal, signal_len, video_input_signal, transcript, transcript_len = batch
+        # if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+        #     log_probs, encoded_len, predictions = self.forward(
+        #         processed_signal=signal, processed_signal_length=signal_len
+        #     )
+        # else:
+        log_probs, encoded_len, predictions = self.forward(audio_input=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+
+        if hasattr(self, '_trainer') and self._trainer is not None:
+            log_every_n_steps = self._trainer.log_every_n_steps
+        else:
+            log_every_n_steps = 1
+
+        loss_value = self.loss(
+            log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
+        )
+
+        # Add auxiliary losses, if registered
+        loss_value = self.add_auxiliary_losses(loss_value)
+        # only computing WER when requested in the logs (same as done for final-layer WER below)
+        loss_value, tensorboard_logs = self.add_interctc_losses(
+            loss_value, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0)
+        )
+
+        # Reset access registry
+        if AccessMixin.is_access_enabled(self.model_guid):
+            AccessMixin.reset_registry(self)
+
+        tensorboard_logs.update(
+            {
+                'train_loss': loss_value,
+                'learning_rate': self._optimizer.param_groups[0]['lr'],
+                'global_step': torch.tensor(self.trainer.global_step, dtype=torch.float32),
+            }
+        )
+
+        if (batch_nb + 1) % log_every_n_steps == 0:
+            self.wer.update(
+                predictions=log_probs,
+                targets=transcript,
+                targets_lengths=transcript_len,
+                predictions_lengths=encoded_len,
+            )
+            wer, _, _ = self.wer.compute()
+            self.wer.reset()
+            tensorboard_logs.update({'training_batch_wer': wer})
+
+        return {'loss': loss_value, 'log': tensorboard_logs}
+
+    def predict_step(self, batch, batch_idx, dataloader_idx=0):
+        signal,
signal_len, video_input_signal, transcript, transcript_len, sample_id = batch + # if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + # log_probs, encoded_len, predictions = self.forward( + # processed_signal=signal, processed_signal_length=signal_len + # ) + # else: + log_probs, encoded_len, predictions = self.forward(audio_input=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal) + + transcribed_texts, _ = self.wer.decoding.ctc_decoder_predictions_tensor( + decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False, + ) + + sample_id = sample_id.cpu().detach().numpy() + return list(zip(sample_id, transcribed_texts)) + + def validation_pass(self, batch, batch_idx, dataloader_idx=0): + if self.is_interctc_enabled(): + AccessMixin.set_access_enabled(access_enabled=True, guid=self.model_guid) + + signal, signal_len, video_input_signal, transcript, transcript_len = batch + # if isinstance(batch, DALIOutputs) and batch.has_processed_signal: + # log_probs, encoded_len, predictions = self.forward( + # processed_signal=signal, processed_signal_length=signal_len + # ) + # else: + log_probs, encoded_len, predictions = self.forward(audio_input=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal) + + loss_value = self.loss( + log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len + ) + loss_value, metrics = self.add_interctc_losses( + loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_", + ) + + self.wer.update( + predictions=log_probs, targets=transcript, targets_lengths=transcript_len, predictions_lengths=encoded_len, + ) + wer, wer_num, wer_denom = self.wer.compute() + self.wer.reset() + metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer}) + + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + # Reset access registry + if AccessMixin.is_access_enabled(self.model_guid): + AccessMixin.reset_registry(self) + + return metrics + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + metrics = self.validation_pass(batch, batch_idx, dataloader_idx) + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(metrics) + else: + self.validation_step_outputs.append(metrics) + return metrics + + def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0): + metrics = super().multi_validation_epoch_end(outputs, dataloader_idx) + self.finalize_interctc_metrics(metrics, outputs, prefix="val_") + return metrics + + def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0): + metrics = super().multi_test_epoch_end(outputs, dataloader_idx) + self.finalize_interctc_metrics(metrics, outputs, prefix="test_") + return metrics + + def test_step(self, batch, batch_idx, dataloader_idx=0): + logs = self.validation_pass(batch, batch_idx, dataloader_idx=dataloader_idx) + test_logs = {name.replace("val_", "test_"): value for name, value in logs.items()} + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(test_logs) + else: + self.test_step_outputs.append(test_logs) + return test_logs + + def test_dataloader(self): + if self._test_dl is not None: + return self._test_dl + + """ Transcription related methods """ + + def _transcribe_on_begin(self, 
audio, trcfg: TranscribeConfig):
+        super()._transcribe_on_begin(audio, trcfg)
+
+        # Freeze the encoder and decoder modules
+        self.encoder.freeze()
+        self.decoder.freeze()
+
+    def _transcribe_on_end(self, trcfg: TranscribeConfig):
+        super()._transcribe_on_end(trcfg)
+
+        # Unfreeze the encoder and decoder modules
+        self.encoder.unfreeze()
+        self.decoder.unfreeze()
+
+    def _transcribe_forward(self, batch: Any, trcfg: TranscribeConfig):
+        logits, logits_len, greedy_predictions = self.forward(audio_input=batch[0], audio_input_signal_length=batch[1], video_input_signal=batch[2])
+        output = dict(logits=logits, logits_len=logits_len)
+        del greedy_predictions
+        return output
+
+    def _transcribe_output_processing(self, outputs, trcfg: TranscribeConfig) -> GenericTranscriptionType:
+        logits = outputs.pop('logits')
+        logits_len = outputs.pop('logits_len')
+
+        current_hypotheses, all_hyp = self.decoding.ctc_decoder_predictions_tensor(
+            logits, decoder_lengths=logits_len, return_hypotheses=trcfg.return_hypotheses,
+        )
+        if trcfg.return_hypotheses:
+            if logits.is_cuda:
+                # See comment in
+                # ctc_greedy_decoding.py::GreedyCTCInfer::forward() to
+                # understand this idiom.
+                logits_cpu = torch.empty(logits.shape, dtype=logits.dtype, device=torch.device("cpu"), pin_memory=True)
+                logits_cpu.copy_(logits, non_blocking=True)
+            else:
+                logits_cpu = logits
+            logits_len = logits_len.cpu()
+            # dump log probs per file
+            for idx in range(logits_cpu.shape[0]):
+                current_hypotheses[idx].y_sequence = logits_cpu[idx][: logits_len[idx]]
+                if current_hypotheses[idx].alignments is None:
+                    current_hypotheses[idx].alignments = current_hypotheses[idx].y_sequence
+            del logits_cpu
+
+        # cleanup memory
+        del logits, logits_len
+
+        hypotheses = []
+        if all_hyp is None:
+            hypotheses += current_hypotheses
+        else:
+            hypotheses += all_hyp
+
+        return hypotheses
+
+    def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLoader':
+        """
+        Setup function for a temporary data loader which wraps the provided audio file.
+
+        Args:
+            config: A python dictionary which contains the following keys:
+            paths2audio_files: (a list) of paths to audio files. The files should be relatively short fragments. \
+                Recommended length per file is between 5 and 25 seconds.
+            batch_size: (int) batch size to use during inference. \
+                Bigger will result in better throughput performance but would use more memory.
+            temp_dir: (str) A temporary directory where the audio manifest is temporarily
+                stored.
+            num_workers: (int) number of workers. Depends on the batch_size and machine. \
+                0 - only the main process will load batches, 1 - one worker (not main process)
+
+        Returns:
+            A pytorch DataLoader for the given audio file(s).
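+
+        Example (a minimal sketch; the keys mirror the ones consumed by this
+        method and the values are placeholders):
+
+        .. code-block:: python
+
+            config = {
+                'paths2audio_files': ['/path/to/clip.wav'],
+                'batch_size': 4,
+                'temp_dir': '/tmp/nemo_transcribe',
+                'num_workers': 0,
+            }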
+        """
+        if 'manifest_filepath' in config:
+            manifest_filepath = config['manifest_filepath']
+            batch_size = config['batch_size']
+        else:
+            manifest_filepath = os.path.join(config['temp_dir'], 'manifest.json')
+            batch_size = min(config['batch_size'], len(config['paths2audio_files']))
+
+        dl_config = {
+            'manifest_filepath': manifest_filepath,
+            'sample_rate': self.preprocessor._sample_rate,
+            'labels': OmegaConf.to_container(self.decoder.vocabulary),
+            'batch_size': batch_size,
+            'trim_silence': False,
+            'shuffle': False,
+            'num_workers': config.get('num_workers', min(batch_size, os.cpu_count() - 1)),
+            'pin_memory': True,
+            'channel_selector': config.get('channel_selector', None),
+        }
+        if config.get("augmentor"):
+            dl_config['augmentor'] = config.get("augmentor")
+
+        temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
+        return temporary_datalayer
+
+    @classmethod
+    def list_available_models(cls) -> List[PretrainedModelInfo]:
+        """
+        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.
+
+        Returns:
+            List of available pre-trained models.
+        """
+        results = []
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="QuartzNet15x5Base-En",
+            description="QuartzNet15x5 model trained on six datasets: LibriSpeech, Mozilla Common Voice (validated clips from en_1488h_2019-12-10), WSJ, Fisher, Switchboard, and NSC Singapore English. It was trained with Apex/Amp optimization level O1 for 600 epochs. The model achieves a WER of 3.79% on LibriSpeech dev-clean, and a WER of 10.05% on dev-other. Please visit https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels for further details.",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_en_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_quartznet15x5/versions/1.0.0rc1/files/stt_en_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_en_jasper10x5dr",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_jasper10x5dr/versions/1.0.0rc1/files/stt_en_jasper10x5dr.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_ca_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_quartznet15x5/versions/1.0.0rc1/files/stt_ca_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_it_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_quartznet15x5/versions/1.0.0rc1/files/stt_it_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_fr_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5",
+
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_quartznet15x5/versions/1.0.0rc1/files/stt_fr_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_es_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_quartznet15x5/versions/1.0.0rc1/files/stt_es_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_de_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_quartznet15x5/versions/1.0.0rc1/files/stt_de_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_pl_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_quartznet15x5/versions/1.0.0rc1/files/stt_pl_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_ru_quartznet15x5",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_quartznet15x5/versions/1.0.0rc1/files/stt_ru_quartznet15x5.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_zh_citrinet_512",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_512/versions/1.0.0rc1/files/stt_zh_citrinet_512.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo",
+        )
+        results.append(model)
+
+        model = PretrainedModelInfo(
+            pretrained_model_name="asr_talknet_aligner",
+            description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:asr_talknet_aligner",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/asr_talknet_aligner/versions/1.0.0rc1/files/qn5x5_libri_tts_phonemes.nemo",
+        )
+        results.append(model)
+
+        return results
+
+    @property
+    def wer(self):
+        return self._wer
+
+    @wer.setter
+    def wer(self, wer):
+        self._wer = wer
diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py
index 66def034400f..03a862a99ec5 100644
--- 
a/nemo/collections/common/parts/preprocessing/collections.py
+++ b/nemo/collections/common/parts/preprocessing/collections.py
@@ -163,7 +163,8 @@ def __init__(
                     # elif hasattr(parser, "lang") and parser.lang is not None:
                     #     text_tokens = parser(text, parser.lang)
                     else:
-                        raise ValueError("lang required in manifest when using aggregate tokenizers")
+                        raise ValueError(
+                            "lang required in manifest when using aggregate tokenizers")
                 else:
                     text_tokens = parser(text)
             else:
@@ -176,7 +177,8 @@ def __init__(
 
             total_duration += duration
 
-            data.append(output_type(id_, audio_file, duration, text_tokens, offset, text, speaker, orig_sr, lang))
+            data.append(output_type(id_, audio_file, duration,
+                        text_tokens, offset, text, speaker, orig_sr, lang))
             if index_by_file_id:
                 file_id, _ = os.path.splitext(os.path.basename(audio_file))
                 if file_id not in self.mapping:
@@ -189,12 +191,133 @@ def __init__(
 
         if do_sort_by_duration:
             if index_by_file_id:
-                logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
+                logging.warning(
+                    "Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
             else:
                 data.sort(key=lambda entity: entity.duration)
 
-        logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600)
-        logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600)
+        logging.info("Dataset loaded with %d files totalling %.2f hours", len(
+            data), total_duration / 3600)
+        logging.info("%d files were filtered totalling %.2f hours",
+                     num_filtered, duration_filtered / 3600)
 
         super().__init__(data)
 
 
+class AVText(_Collection):
+    """List of audio-video-transcript correspondences with preprocessing."""
+
+    AV_OUTPUT_TYPE = collections.namedtuple(
+        typename='AVTextEntity',
+        field_names='id audio_file video_featfile duration text_tokens offset text_raw speaker orig_sr lang',
+    )
+
+    def __init__(
+        self,
+        ids: List[int],
+        audio_files: List[str],
+        video_featfiles: List[str],
+        durations: List[float],
+        texts: List[str],
+        offsets: List[str],
+        speakers: List[Optional[int]],
+        orig_sampling_rates: List[Optional[int]],
+        token_labels: List[Optional[int]],
+        langs: List[Optional[str]],
+        parser: parsers.CharParser,
+        min_duration: Optional[float] = None,
+        max_duration: Optional[float] = None,
+        max_number: Optional[int] = None,
+        do_sort_by_duration: bool = False,
+        index_by_file_id: bool = False,
+    ):
+        """Instantiates an audio-video-text manifest with filters and preprocessing.
+
+        Args:
+            ids: List of examples positions.
+            audio_files: List of audio files.
+            video_featfiles: List of video feature files.
+            durations: List of float durations.
+            texts: List of raw text transcripts.
+            offsets: List of duration offsets or None.
+            speakers: List of optional speakers ids.
+            orig_sampling_rates: List of original sampling rates of audio files.
+            token_labels: List of optional pre-tokenized label sequences, or None.
+            langs: List of language ids, one for each sample, or None.
+            parser: Instance of `CharParser` to convert string to tokens.
+            min_duration: Minimum duration to keep entry with (default: None).
+            max_duration: Maximum duration to keep entry with (default: None).
+            max_number: Maximum number of samples to collect.
+            do_sort_by_duration: True if sort samples list by duration. Not compatible with index_by_file_id.
+            index_by_file_id: If True, saves a mapping from filename base (ID) to index in data.
+ """ + + output_type = self.AV_OUTPUT_TYPE + data, duration_filtered, num_filtered, total_duration = [], 0.0, 0, 0.0 + if index_by_file_id: + self.mapping = {} + + for id_, audio_file, video_featfile, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( + ids, audio_files, video_featfiles, durations, offsets, texts, speakers, orig_sampling_rates, token_labels, langs + ): + # Duration filters. + if min_duration is not None and duration < min_duration: + duration_filtered += duration + num_filtered += 1 + continue + + if max_duration is not None and duration > max_duration: + duration_filtered += duration + num_filtered += 1 + continue + + if token_labels is not None: + text_tokens = token_labels + else: + if text != '': + if hasattr(parser, "is_aggregate") and parser.is_aggregate and isinstance(text, str): + if lang is not None: + text_tokens = parser(text, lang) + # for future use if want to add language bypass to audio_to_text classes + # elif hasattr(parser, "lang") and parser.lang is not None: + # text_tokens = parser(text, parser.lang) + else: + raise ValueError( + "lang required in manifest when using aggregate tokenizers") + else: + text_tokens = parser(text) + else: + text_tokens = [] + + if text_tokens is None: + duration_filtered += duration + num_filtered += 1 + continue + + total_duration += duration + + data.append(output_type(id_, audio_file, video_featfile, duration, + text_tokens, offset, text, speaker, orig_sr, lang)) + if index_by_file_id: + file_id, _ = os.path.splitext(os.path.basename(audio_file)) + if file_id not in self.mapping: + self.mapping[file_id] = [] + self.mapping[file_id].append(len(data) - 1) + + # Max number of entities filter. + if len(data) == max_number: + break + + if do_sort_by_duration: + if index_by_file_id: + logging.warning( + "Tried to sort dataset by duration, but cannot since index_by_file_id is set.") + else: + data.sort(key=lambda entity: entity.duration) + + logging.info("Dataset loaded with %d files totalling %.2f hours", len( + data), total_duration / 3600) + logging.info("%d files were filtered totalling %.2f hours", + num_filtered, duration_filtered / 3600) super().__init__(data) @@ -271,7 +394,8 @@ def __init__( if lang is not None: text_tokens = parser(text, lang) else: - raise ValueError("lang required in manifest when using aggregate tokenizers") + raise ValueError( + "lang required in manifest when using aggregate tokenizers") else: text_tokens = parser(text) else: @@ -284,7 +408,8 @@ def __init__( total_duration += duration - data.append(output_type(id_, video_file, duration, text_tokens, offset, text, speaker, orig_sr, lang)) + data.append(output_type(id_, video_file, duration, + text_tokens, offset, text, speaker, orig_sr, lang)) if index_by_file_id: file_id, _ = os.path.splitext(os.path.basename(video_file)) if file_id not in self.mapping: @@ -297,12 +422,15 @@ def __init__( if do_sort_by_duration: if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") + logging.warning( + "Tried to sort dataset by duration, but cannot since index_by_file_id is set.") else: data.sort(key=lambda entity: entity.duration) - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) + logging.info("Dataset loaded with %d files totalling %.2f hours", len( + data), total_duration / 3600) + logging.info("%d files were filtered 
totalling %.2f hours",
+                     num_filtered, duration_filtered / 3600)
+
+        super().__init__(data)
+
+
@@ -343,6 +471,44 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
         )
 
 
+class ASR_AV_AudioText(AVText):
+    """`AVText` collector from asr structured json files."""
+
+    def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
+        """Parse lists of audio files, video feature files, durations and transcript texts.
+
+        Args:
+            manifests_files: Either single string file or list of such -
+                manifests to yield items from.
+            *args: Args to pass to `AVText` constructor.
+            **kwargs: Kwargs to pass to `AVText` constructor.
+        """
+
+        ids, audio_files, durations, texts, offsets, video_featfiles = (
+            [],
+            [],
+            [],
+            [],
+            [],
+            [],
+        )
+        speakers, orig_srs, token_labels, langs = [], [], [], []
+        for item in manifest.item_iter(manifests_files):
+            ids.append(item['id'])
+            audio_files.append(item['audio_file'])
+            video_featfiles.append(item['video_featfile'])
+            durations.append(item['duration'])
+            texts.append(item['text'])
+            offsets.append(item['offset'])
+            speakers.append(item['speaker'])
+            orig_srs.append(item['orig_sr'])
+            token_labels.append(item['token_labels'])
+            langs.append(item['lang'])
+        super().__init__(
+            ids, audio_files, video_featfiles, durations, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs
+        )
+
+
 class ASRVideoText(VideoText):
     """`VideoText` collector from cv structured json files."""
 
@@ -382,7 +548,8 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
 class SpeechLabel(_Collection):
     """List of audio-label correspondence with preprocessing."""
 
-    OUTPUT_TYPE = collections.namedtuple(typename='SpeechLabelEntity', field_names='audio_file duration label offset',)
+    OUTPUT_TYPE = collections.namedtuple(
+        typename='SpeechLabelEntity', field_names='audio_file duration label offset',)
 
     def __init__(
         self,
@@ -438,14 +605,18 @@ def __init__(
 
         if do_sort_by_duration:
             if index_by_file_id:
-                logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
+                logging.warning(
+                    "Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
             else:
                 data.sort(key=lambda entity: entity.duration)
 
-        logging.info(f"Filtered duration for loading collection is {duration_filtered / 3600: .2f} hours.")
-        logging.info(f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600: .2f} hours.")
+        logging.info(
+            f"Filtered duration for loading collection is {duration_filtered / 3600: .2f} hours.")
+        logging.info(
+            f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600: .2f} hours.")
 
         self.uniq_labels = sorted(set(map(lambda x: x.label, data)))
-        logging.info("# {} files loaded accounting to # {} labels".format(len(data), len(self.uniq_labels)))
+        logging.info("# {} files loaded accounting to # {} labels".format(
+            len(data), len(self.uniq_labels)))
 
         super().__init__(data)
 
@@ -502,12 +673,15 @@ def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]:
         elif 'audio_filepath' in item:
            item['audio_file'] = item.pop('audio_filepath')
         else:
-            raise ValueError(f"Manifest file has invalid json line structure: {line} without proper audio file key.")
-        item['audio_file'] = manifest.get_full_path(audio_file=item['audio_file'], manifest_file=manifest_file)
+            raise ValueError(
+                f"Manifest file has invalid json line structure: {line} without proper audio file key.")
+        item['audio_file'] = manifest.get_full_path(
audio_file=item['audio_file'], manifest_file=manifest_file) # Duration. if 'duration' not in item: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper duration key.") + raise ValueError( + f"Manifest file has invalid json line structure: {line} without proper duration key.") # Label. if 'command' in item: @@ -517,7 +691,8 @@ def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: elif 'label' in item: pass else: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper label key.") + raise ValueError( + f"Manifest file has invalid json line structure: {line} without proper label key.") item = dict( audio_file=item['audio_file'], @@ -532,7 +707,8 @@ def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: class FeatureSequenceLabel(_Collection): """List of feature sequence of label correspondence with preprocessing.""" - OUTPUT_TYPE = collections.namedtuple(typename='FeatureSequenceLabelEntity', field_names='feature_file seq_label',) + OUTPUT_TYPE = collections.namedtuple( + typename='FeatureSequenceLabelEntity', field_names='feature_file seq_label',) def __init__( self, @@ -562,7 +738,8 @@ def __init__( for feature_file, seq_label in zip(feature_files, seq_labels): - label_tokens, uniq_labels_in_seq = self.relative_speaker_parser(seq_label) + label_tokens, uniq_labels_in_seq = self.relative_speaker_parser( + seq_label) data.append(output_type(feature_file, label_tokens)) self.uniq_labels |= uniq_labels_in_seq @@ -579,7 +756,8 @@ def __init__( if len(data) == max_number: break - logging.info("# {} files loaded including # {} unique labels".format(len(data), len(self.uniq_labels))) + logging.info("# {} files loaded including # {} unique labels".format( + len(data), len(self.uniq_labels))) super().__init__(data) def relative_speaker_parser(self, seq_label): @@ -616,7 +794,6 @@ class ASRFeatureSequenceLabel(FeatureSequenceLabel): def __init__( self, manifests_files: Union[str, List[str]], max_number: Optional[int] = None, index_by_file_id: bool = False, ): - """Parse lists of feature files and sequences of labels. Args: @@ -655,7 +832,8 @@ def _parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: f"Manifest file has invalid json line " f"structure: {line} without proper seq_label key." 
) - item = dict(feature_file=item['feature_file'], seq_label=item['seq_label'],) + item = dict(feature_file=item['feature_file'], + seq_label=item['seq_label'],) return item @@ -754,14 +932,16 @@ def __init__( if do_sort_by_duration: if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") + logging.warning( + "Tried to sort dataset by duration, but cannot since index_by_file_id is set.") else: data.sort(key=lambda entity: entity.duration) logging.info( "Filtered duration for loading collection is %f.", duration_filtered, ) - logging.info(f"Total {len(data)} session files loaded accounting to # {len(audio_files)} audio clips") + logging.info( + f"Total {len(data)} session files loaded accounting to # {len(audio_files)} audio clips") super().__init__(data) @@ -821,12 +1001,15 @@ def __init__( for item in manifest.item_iter(manifests_files, parse_func=self.__parse_item_rttm): # Inference mode if self.pairwise_infer: - clus_speaker_digits = sorted(list(set([x[2] for x in clus_label_dict[item['uniq_id']]]))) + clus_speaker_digits = sorted( + list(set([x[2] for x in clus_label_dict[item['uniq_id']]]))) if item['rttm_file']: base_scale_index = max(self.emb_dict.keys()) _sess_spk_dict = self.emb_dict[base_scale_index][item['uniq_id']]['mapping'] - sess_spk_dict = {int(v.split('_')[-1]): k for k, v in _sess_spk_dict.items()} - rttm_speaker_digits = [int(v.split('_')[1]) for k, v in _sess_spk_dict.items()] + sess_spk_dict = { + int(v.split('_')[-1]): k for k, v in _sess_spk_dict.items()} + rttm_speaker_digits = [int(v.split('_')[1]) + for k, v in _sess_spk_dict.items()] if self.seq_eval_mode: clus_speaker_digits = rttm_speaker_digits else: @@ -838,14 +1021,17 @@ def __init__( rttm_labels = [] with open(item['rttm_file'], 'r') as f: for line in f.readlines(): - start, end, speaker = self.split_rttm_line(line, decimals=3) - rttm_labels.append('{} {} {}'.format(start, end, speaker)) + start, end, speaker = self.split_rttm_line( + line, decimals=3) + rttm_labels.append( + '{} {} {}'.format(start, end, speaker)) speaker_set = set() for rttm_line in rttm_labels: spk_str = rttm_line.split()[-1] speaker_set.add(spk_str) speaker_list = sorted(list(speaker_set)) - sess_spk_dict = {key: val for key, val in enumerate(speaker_list)} + sess_spk_dict = {key: val for key, + val in enumerate(speaker_list)} target_spks = tuple(sess_spk_dict.keys()) clus_speaker_digits = target_spks rttm_speaker_digits = target_spks @@ -853,7 +1039,8 @@ def __init__( if len(clus_speaker_digits) <= 2: spk_comb_list = [(0, 1)] else: - spk_comb_list = [x for x in combinations(clus_speaker_digits, 2)] + spk_comb_list = [x for x in combinations( + clus_speaker_digits, 2)] for target_spks in spk_comb_list: audio_files.append(item['audio_file']) @@ -923,9 +1110,11 @@ def __parse_item_rttm(self, line: str, manifest_file: str) -> Dict[str, Any]: f"Manifest file has invalid json line " f"structure: {line} without proper audio file key." 
) item['audio_file'] = os.path.expanduser(item['audio_file']) - item['uniq_id'] = os.path.splitext(os.path.basename(item['audio_file']))[0] + item['uniq_id'] = os.path.splitext( + os.path.basename(item['audio_file']))[0] if 'duration' not in item: - raise ValueError(f"Manifest file has invalid json line " f"structure: {line} without proper duration key.") + raise ValueError( + f"Manifest file has invalid json line " f"structure: {line} without proper duration key.") item = dict( audio_file=item['audio_file'], uniq_id=item['uniq_id'], @@ -940,7 +1129,8 @@ class Audio(_Collection): """Prepare a list of all audio items, filtered by duration. """ - OUTPUT_TYPE = collections.namedtuple(typename='Audio', field_names='audio_files duration offset text') + OUTPUT_TYPE = collections.namedtuple( + typename='Audio', field_names='audio_files duration offset text') def __init__( self, @@ -992,8 +1182,10 @@ def __init__( if do_sort_by_duration: data.sort(key=lambda entity: entity.duration) - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) + logging.info("Dataset loaded with %d files totalling %.2f hours", len( + data), total_duration / 3600) + logging.info("%d files were filtered totalling %.2f hours", + num_filtered, duration_filtered / 3600) super().__init__(data) @@ -1033,7 +1225,8 @@ def __init__( offset_list.append(item['offset']) text_list.append(item['text']) - super().__init__(audio_files_list, duration_list, offset_list, text_list, *args, **kwargs) + super().__init__(audio_files_list, duration_list, + offset_list, text_list, *args, **kwargs) def __parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]: """Parse a single line from a manifest file. @@ -1067,9 +1260,11 @@ def get_audio_file(item: Dict, manifest_key: Union[str, List[str]]): elif isinstance(item_key, list): audio_file += item_key else: - raise ValueError(f'Unexpected type {type(item_key)} of item for key {key}: {item_key}') + raise ValueError( + f'Unexpected type {type(item_key)} of item for key {key}: {item_key}') else: - raise ValueError(f'Unexpected type {type(manifest_key)} of manifest_key: {manifest_key}') + raise ValueError( + f'Unexpected type {type(manifest_key)} of manifest_key: {manifest_key}') return audio_file @@ -1085,21 +1280,25 @@ def get_audio_file(item: Dict, manifest_key: Union[str, List[str]]): # Get full path to audio file(s) if isinstance(audio_file, str): # This dictionary entry points to a single file - audio_files[audio_key] = manifest.get_full_path(audio_file, manifest_file) + audio_files[audio_key] = manifest.get_full_path( + audio_file, manifest_file) elif isinstance(audio_file, Iterable): # This dictionary entry points to multiple files # Get the files and keep the list structure for this key - audio_files[audio_key] = [manifest.get_full_path(f, manifest_file) for f in audio_file] + audio_files[audio_key] = [manifest.get_full_path( + f, manifest_file) for f in audio_file] elif audio_file is None and audio_key.startswith('target'): # For inference, we don't need the target audio_files[audio_key] = None else: - raise ValueError(f'Unexpected type {type(audio_file)} of audio_file: {audio_file}') + raise ValueError( + f'Unexpected type {type(audio_file)} of audio_file: {audio_file}') item['audio_files'] = audio_files # Handle duration if 'duration' not in item: - raise ValueError(f'Duration not available in line: {line}. 
Manifest file: {manifest_file}')
+            raise ValueError(
+                f'Duration not available in line: {line}. Manifest file: {manifest_file}')
 
         # Handle offset
         if 'offset' not in item:
@@ -1117,7 +1316,8 @@ def get_audio_file(item: Dict, manifest_key: Union[str, List[str]]):
 class FeatureLabel(_Collection):
     """List of feature sequence and their label correspondence with preprocessing."""
 
-    OUTPUT_TYPE = collections.namedtuple(typename='FeatureLabelEntity', field_names='feature_file label duration',)
+    OUTPUT_TYPE = collections.namedtuple(
+        typename='FeatureLabelEntity', field_names='feature_file label duration',)
 
     def __init__(
         self,
@@ -1172,13 +1372,17 @@ def __init__(
 
         if do_sort_by_duration:
             if index_by_file_id:
-                logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
+                logging.warning(
+                    "Tried to sort dataset by duration, but cannot since index_by_file_id is set.")
            else:
                 data.sort(key=lambda entity: entity.duration)
 
-        logging.info(f"Filtered duration for loading collection is {duration_filtered / 2600:.2f} hours.")
-        logging.info(f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600: .2f} hours.")
-        logging.info("# {} files loaded including # {} unique labels".format(len(data), len(self.uniq_labels)))
+        logging.info(
+            f"Filtered duration for loading collection is {duration_filtered / 3600:.2f} hours.")
+        logging.info(
+            f"Dataset loaded with {len(data)} items, total duration of {total_duration / 3600:.2f} hours.")
+        logging.info("# {} files loaded including # {} unique labels".format(
+            len(data), len(self.uniq_labels)))
 
         super().__init__(data)
 
@@ -1194,7 +1398,6 @@ def __init__(
         *args,
         **kwargs,
     ):
-
         """Parse lists of feature files and sequences of labels.
 
         Args:
@@ -1236,15 +1439,18 @@ def _parse_item(self, line: str, manifest_file: str) -> Dict[str, Any]:
             raise ValueError(
                 f"Manifest file has invalid json line " f"structure: {line} without proper 'feature_file' key."
             )
-        item['feature_file'] = manifest.get_full_path(audio_file=item['feature_file'], manifest_file=manifest_file)
+        item['feature_file'] = manifest.get_full_path(
+            audio_file=item['feature_file'], manifest_file=manifest_file)
 
         # Label.
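+        # A 'label' key is required for every FeatureLabel manifest entry; failing
+        # fast here surfaces a malformed manifest as a clear error instead of a
+        # silent feature/label mismatch later in training.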
if 'label' in item: item['label'] = item.pop('label') else: - raise ValueError(f"Manifest file has invalid json line structure: {line} without proper 'label' key.") + raise ValueError( + f"Manifest file has invalid json line structure: {line} without proper 'label' key.") - item = dict(feature_file=item['feature_file'], label=item['label'], duration=item['duration']) + item = dict(feature_file=item['feature_file'], + label=item['label'], duration=item['duration']) return item @@ -1332,7 +1538,8 @@ def __init__( if lang is not None: text_tokens = parser(text, lang) else: - raise ValueError("lang required in manifest when using aggregate tokenizers") + raise ValueError( + "lang required in manifest when using aggregate tokenizers") else: text_tokens = parser(text) else: @@ -1346,7 +1553,8 @@ def __init__( total_duration += duration data.append( - output_type(id_, feat_file, rttm_file, duration, text_tokens, offset, text, speaker, orig_sr, lang) + output_type(id_, feat_file, rttm_file, duration, + text_tokens, offset, text, speaker, orig_sr, lang) ) if index_by_file_id: file_id, _ = os.path.splitext(os.path.basename(feat_file)) @@ -1360,12 +1568,15 @@ def __init__( if do_sort_by_duration: if index_by_file_id: - logging.warning("Tried to sort dataset by duration, but cannot since index_by_file_id is set.") + logging.warning( + "Tried to sort dataset by duration, but cannot since index_by_file_id is set.") else: data.sort(key=lambda entity: entity.duration) - logging.info("Dataset loaded with %d files totalling %.2f hours", len(data), total_duration / 3600) - logging.info("%d files were filtered totalling %.2f hours", num_filtered, duration_filtered / 3600) + logging.info("Dataset loaded with %d files totalling %.2f hours", len( + data), total_duration / 3600) + logging.info("%d files were filtered totalling %.2f hours", + num_filtered, duration_filtered / 3600) super().__init__(data) diff --git a/scripts/tokenizers/sentencepiece_model_pb2.py b/scripts/tokenizers/sentencepiece_model_pb2.py new file mode 100644 index 000000000000..bd196c531a2c --- /dev/null +++ b/scripts/tokenizers/sentencepiece_model_pb2.py @@ -0,0 +1,764 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: sentencepiece_model.proto + +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='sentencepiece_model.proto', + package='sentencepiece', + syntax='proto2', + serialized_options=b'H\003', + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\xa4\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\x12\"\n\x18seed_sentencepieces_file\x18\x36 \x01(\t:\x00\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03' +) + + + +_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( + name='ModelType', + full_name='sentencepiece.TrainerSpec.ModelType', + filename=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + values=[ + _descriptor.EnumValueDescriptor( + name='UNIGRAM', index=0, number=1, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='BPE', index=1, number=2, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='WORD', index=2, number=3, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='CHAR', index=3, number=4, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + ], + containing_type=None, + serialized_options=None, + serialized_start=1553, + serialized_end=1606, +) +_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) + +_MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( + name='Type', + full_name='sentencepiece.ModelProto.SentencePiece.Type', + filename=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + values=[ + _descriptor.EnumValueDescriptor( + name='NORMAL', index=0, number=1, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='UNKNOWN', index=1, number=2, + serialized_options=None, + type=None, + 
create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='CONTROL', index=2, number=3, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='USER_DEFINED', index=3, number=4, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='BYTE', index=4, number=6, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='UNUSED', index=5, number=5, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + ], + containing_type=None, + serialized_options=None, + serialized_start=2359, + serialized_end=2443, +) +_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) + + +_TRAINERSPEC = _descriptor.Descriptor( + name='TrainerSpec', + full_name='sentencepiece.TrainerSpec', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='input', full_name='sentencepiece.TrainerSpec.input', index=0, + number=1, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='input_format', full_name='sentencepiece.TrainerSpec.input_format', index=1, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='model_prefix', full_name='sentencepiece.TrainerSpec.model_prefix', index=2, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='model_type', full_name='sentencepiece.TrainerSpec.model_type', index=3, + number=3, type=14, cpp_type=8, label=1, + has_default_value=True, default_value=1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='vocab_size', full_name='sentencepiece.TrainerSpec.vocab_size', index=4, + number=4, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=8000, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='accept_language', full_name='sentencepiece.TrainerSpec.accept_language', index=5, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='self_test_sample_size', 
full_name='sentencepiece.TrainerSpec.self_test_sample_size', index=6, + number=6, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='enable_differential_privacy', full_name='sentencepiece.TrainerSpec.enable_differential_privacy', index=7, + number=50, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='differential_privacy_noise_level', full_name='sentencepiece.TrainerSpec.differential_privacy_noise_level', index=8, + number=51, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='differential_privacy_clipping_threshold', full_name='sentencepiece.TrainerSpec.differential_privacy_clipping_threshold', index=9, + number=52, type=4, cpp_type=4, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='character_coverage', full_name='sentencepiece.TrainerSpec.character_coverage', index=10, + number=10, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.9995), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='input_sentence_size', full_name='sentencepiece.TrainerSpec.input_sentence_size', index=11, + number=11, type=4, cpp_type=4, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='shuffle_input_sentence', full_name='sentencepiece.TrainerSpec.shuffle_input_sentence', index=12, + number=19, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='mining_sentence_size', full_name='sentencepiece.TrainerSpec.mining_sentence_size', index=13, + number=12, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=b'\030\001', file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='training_sentence_size', full_name='sentencepiece.TrainerSpec.training_sentence_size', index=14, + number=13, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=b'\030\001', file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='seed_sentencepiece_size', full_name='sentencepiece.TrainerSpec.seed_sentencepiece_size', index=15, + number=14, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=1000000, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='shrinking_factor', full_name='sentencepiece.TrainerSpec.shrinking_factor', index=16, + number=15, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.75), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='max_sentence_length', full_name='sentencepiece.TrainerSpec.max_sentence_length', index=17, + number=18, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=4192, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='num_threads', full_name='sentencepiece.TrainerSpec.num_threads', index=18, + number=16, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=16, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='num_sub_iterations', full_name='sentencepiece.TrainerSpec.num_sub_iterations', index=19, + number=17, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=2, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='max_sentencepiece_length', full_name='sentencepiece.TrainerSpec.max_sentencepiece_length', index=20, + number=20, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=16, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='split_by_unicode_script', full_name='sentencepiece.TrainerSpec.split_by_unicode_script', index=21, + number=21, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='split_by_number', full_name='sentencepiece.TrainerSpec.split_by_number', index=22, + number=23, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='split_by_whitespace', 
full_name='sentencepiece.TrainerSpec.split_by_whitespace', index=23, + number=22, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='treat_whitespace_as_suffix', full_name='sentencepiece.TrainerSpec.treat_whitespace_as_suffix', index=24, + number=24, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='allow_whitespace_only_pieces', full_name='sentencepiece.TrainerSpec.allow_whitespace_only_pieces', index=25, + number=26, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='split_digits', full_name='sentencepiece.TrainerSpec.split_digits', index=26, + number=25, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='pretokenization_delimiter', full_name='sentencepiece.TrainerSpec.pretokenization_delimiter', index=27, + number=53, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='control_symbols', full_name='sentencepiece.TrainerSpec.control_symbols', index=28, + number=30, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='user_defined_symbols', full_name='sentencepiece.TrainerSpec.user_defined_symbols', index=29, + number=31, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='required_chars', full_name='sentencepiece.TrainerSpec.required_chars', index=30, + number=36, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='byte_fallback', full_name='sentencepiece.TrainerSpec.byte_fallback', index=31, + number=35, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='vocabulary_output_piece_score', full_name='sentencepiece.TrainerSpec.vocabulary_output_piece_score', index=32, + number=32, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='hard_vocab_limit', full_name='sentencepiece.TrainerSpec.hard_vocab_limit', index=33, + number=33, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='use_all_vocab', full_name='sentencepiece.TrainerSpec.use_all_vocab', index=34, + number=34, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='unk_id', full_name='sentencepiece.TrainerSpec.unk_id', index=35, + number=40, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='bos_id', full_name='sentencepiece.TrainerSpec.bos_id', index=36, + number=41, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='eos_id', full_name='sentencepiece.TrainerSpec.eos_id', index=37, + number=42, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=2, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='pad_id', full_name='sentencepiece.TrainerSpec.pad_id', index=38, + number=43, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=-1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='unk_piece', full_name='sentencepiece.TrainerSpec.unk_piece', index=39, + number=45, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='bos_piece', full_name='sentencepiece.TrainerSpec.bos_piece', index=40, + number=46, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, 
containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='eos_piece', full_name='sentencepiece.TrainerSpec.eos_piece', index=41, + number=47, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='pad_piece', full_name='sentencepiece.TrainerSpec.pad_piece', index=42, + number=48, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='unk_surface', full_name='sentencepiece.TrainerSpec.unk_surface', index=43, + number=44, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b" \342\201\207 ".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='train_extremely_large_corpus', full_name='sentencepiece.TrainerSpec.train_extremely_large_corpus', index=44, + number=49, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='seed_sentencepieces_file', full_name='sentencepiece.TrainerSpec.seed_sentencepieces_file', index=45, + number=54, type=9, cpp_type=9, label=1, + has_default_value=True, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _TRAINERSPEC_MODELTYPE, + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(200, 536870912), ], + oneofs=[ + ], + serialized_start=45, + serialized_end=1617, +) + + +_NORMALIZERSPEC = _descriptor.Descriptor( + name='NormalizerSpec', + full_name='sentencepiece.NormalizerSpec', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='sentencepiece.NormalizerSpec.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='precompiled_charsmap', full_name='sentencepiece.NormalizerSpec.precompiled_charsmap', index=1, + number=2, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=b"", + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, 
create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='add_dummy_prefix', full_name='sentencepiece.NormalizerSpec.add_dummy_prefix', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='remove_extra_whitespaces', full_name='sentencepiece.NormalizerSpec.remove_extra_whitespaces', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='escape_whitespaces', full_name='sentencepiece.NormalizerSpec.escape_whitespaces', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='normalization_rule_tsv', full_name='sentencepiece.NormalizerSpec.normalization_rule_tsv', index=5, + number=6, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(200, 536870912), ], + oneofs=[ + ], + serialized_start=1620, + serialized_end=1829, +) + + +_SELFTESTDATA_SAMPLE = _descriptor.Descriptor( + name='Sample', + full_name='sentencepiece.SelfTestData.Sample', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='input', full_name='sentencepiece.SelfTestData.Sample.input', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='expected', full_name='sentencepiece.SelfTestData.Sample.expected', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1900, + serialized_end=1941, +) + +_SELFTESTDATA = _descriptor.Descriptor( + name='SelfTestData', + full_name='sentencepiece.SelfTestData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='samples', 
full_name='sentencepiece.SelfTestData.samples', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[_SELFTESTDATA_SAMPLE, ], + enum_types=[ + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(200, 536870912), ], + oneofs=[ + ], + serialized_start=1831, + serialized_end=1952, +) + + +_MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( + name='SentencePiece', + full_name='sentencepiece.ModelProto.SentencePiece', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='piece', full_name='sentencepiece.ModelProto.SentencePiece.piece', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='score', full_name='sentencepiece.ModelProto.SentencePiece.score', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='type', full_name='sentencepiece.ModelProto.SentencePiece.type', index=2, + number=3, type=14, cpp_type=8, label=1, + has_default_value=True, default_value=1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _MODELPROTO_SENTENCEPIECE_TYPE, + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(200, 536870912), ], + oneofs=[ + ], + serialized_start=2244, + serialized_end=2454, +) + +_MODELPROTO = _descriptor.Descriptor( + name='ModelProto', + full_name='sentencepiece.ModelProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='pieces', full_name='sentencepiece.ModelProto.pieces', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='trainer_spec', full_name='sentencepiece.ModelProto.trainer_spec', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='normalizer_spec', full_name='sentencepiece.ModelProto.normalizer_spec', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='self_test_data', full_name='sentencepiece.ModelProto.self_test_data', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='denormalizer_spec', full_name='sentencepiece.ModelProto.denormalizer_spec', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[_MODELPROTO_SENTENCEPIECE, ], + enum_types=[ + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(200, 536870912), ], + oneofs=[ + ], + serialized_start=1955, + serialized_end=2465, +) + +_TRAINERSPEC.fields_by_name['model_type'].enum_type = _TRAINERSPEC_MODELTYPE +_TRAINERSPEC_MODELTYPE.containing_type = _TRAINERSPEC +_SELFTESTDATA_SAMPLE.containing_type = _SELFTESTDATA +_SELFTESTDATA.fields_by_name['samples'].message_type = _SELFTESTDATA_SAMPLE +_MODELPROTO_SENTENCEPIECE.fields_by_name['type'].enum_type = _MODELPROTO_SENTENCEPIECE_TYPE +_MODELPROTO_SENTENCEPIECE.containing_type = _MODELPROTO +_MODELPROTO_SENTENCEPIECE_TYPE.containing_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name['pieces'].message_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name['trainer_spec'].message_type = _TRAINERSPEC +_MODELPROTO.fields_by_name['normalizer_spec'].message_type = _NORMALIZERSPEC +_MODELPROTO.fields_by_name['self_test_data'].message_type = _SELFTESTDATA +_MODELPROTO.fields_by_name['denormalizer_spec'].message_type = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name['TrainerSpec'] = _TRAINERSPEC +DESCRIPTOR.message_types_by_name['NormalizerSpec'] = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name['SelfTestData'] = _SELFTESTDATA +DESCRIPTOR.message_types_by_name['ModelProto'] = _MODELPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +TrainerSpec = _reflection.GeneratedProtocolMessageType('TrainerSpec', (_message.Message,), { + 'DESCRIPTOR' : _TRAINERSPEC, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) + }) +_sym_db.RegisterMessage(TrainerSpec) + +NormalizerSpec = _reflection.GeneratedProtocolMessageType('NormalizerSpec', (_message.Message,), { + 'DESCRIPTOR' : _NORMALIZERSPEC, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) + }) +_sym_db.RegisterMessage(NormalizerSpec) + +SelfTestData = _reflection.GeneratedProtocolMessageType('SelfTestData', (_message.Message,), { + + 'Sample' : _reflection.GeneratedProtocolMessageType('Sample', (_message.Message,), { + 'DESCRIPTOR' : _SELFTESTDATA_SAMPLE, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) + }) + , + 'DESCRIPTOR' : _SELFTESTDATA, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) + }) 
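+# Both the container message and its nested Sample type are registered with the
+# protobuf symbol database below, so each can be resolved by its fully-qualified
+# name (e.g. sentencepiece.SelfTestData.Sample).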
+_sym_db.RegisterMessage(SelfTestData) +_sym_db.RegisterMessage(SelfTestData.Sample) + +ModelProto = _reflection.GeneratedProtocolMessageType('ModelProto', (_message.Message,), { + + 'SentencePiece' : _reflection.GeneratedProtocolMessageType('SentencePiece', (_message.Message,), { + 'DESCRIPTOR' : _MODELPROTO_SENTENCEPIECE, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) + }) + , + 'DESCRIPTOR' : _MODELPROTO, + '__module__' : 'sentencepiece_model_pb2' + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) + }) +_sym_db.RegisterMessage(ModelProto) +_sym_db.RegisterMessage(ModelProto.SentencePiece) + + +DESCRIPTOR._options = None +_TRAINERSPEC.fields_by_name['mining_sentence_size']._options = None +_TRAINERSPEC.fields_by_name['training_sentence_size']._options = None +# @@protoc_insertion_point(module_scope) diff --git a/tools/nemo_forced_aligner/align.py b/tools/nemo_forced_aligner/align.py index d298e8072d58..3f795606b4b7 100644 --- a/tools/nemo_forced_aligner/align.py +++ b/tools/nemo_forced_aligner/align.py @@ -148,7 +148,7 @@ class AlignmentConfig: simulate_cache_aware_streaming: Optional[bool] = False # Output file configs - save_output_file_formats: List[str] = field(default_factory=lambda: ["ctm", "ass"]) + save_output_file_formats: List[str] = field(default_factory=lambda: ["ctm"]) ctm_file_config: CTMFileConfig = field(default_factory=lambda: CTMFileConfig()) ass_file_config: ASSFileConfig = field(default_factory=lambda: ASSFileConfig()) diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb index 94e2caa17a58..cdc0afdf5a81 100644 --- a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb +++ b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb @@ -1,22 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "EGV_ioUHqhun" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -48,9 +38,7 @@ "that you want to use the \"Run All Cells\" (or similar) option.\n", "\"\"\"\n", "# exit()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -75,9 +63,11 @@ }, { "cell_type": "code", + "execution_count": 2, "metadata": { "id": "1cjMaek4rY8-" }, + "outputs": [], "source": [ "import os\n", "import glob\n", @@ -86,15 +76,15 @@ "import wget\n", "import copy\n", "from omegaconf import OmegaConf, open_dict" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 3, "metadata": { "id": "8wqTRjpNruZD" }, + "outputs": [], "source": [ "data_dir = 'datasets/'\n", "\n", @@ -103,23 +93,21 @@ "\n", "if not os.path.exists(\"scripts\"):\n", " os.makedirs(\"scripts\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 4, "metadata": { "id": "TSTb6b5DriWG" }, + "outputs": [], "source": [ "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from nemo.collections.asr.metrics.wer import word_error_rate\n", "from nemo.utils import logging, exp_manager" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -138,6 +126,9 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "k40Q84TNnU6O" 
+ }, "source": [ "## Hugging Face\n", "\n", @@ -156,59 +147,71 @@ "Code steps:\n", "- Now below, run `login()`\n", "- Paste your preserved HF TOKEN API KEY to the text box.\"" - ], - "metadata": { - "id": "k40Q84TNnU6O" - } + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { "id": "27h1i8qa7WFE" }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b900b618b438414dacc6cd9c7928a8ef", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
] 14.39K --.-KB/s in 0s \n", + "\n", + "2024-06-26 19:36:46 (213 MB/s) - ‘convert_hf_dataset_to_nemo.py’ saved [14735/14735]\n", + "\n" + ] + } + ], + "source": [ + "if not os.path.exists(\"convert_hf_dataset_to_nemo.py\"):\n", + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py\n" + ] }, { "cell_type": "code", + "execution_count": 11, "metadata": { "id": "Inwx4OE97guu" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", + "The version_base parameter is not specified.\n", + "Please specify a compatability version level, or None.\n", + "Will assume defaults for version 1.1\n", + " @hydra.main(config_name='hfds_config', config_path=None)\n", + "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", + "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", + "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", + " warnings.warn(\n", + "Downloading builder script: 100%|██████████| 11.5k/11.5k [00:00<00:00, 45.5MB/s]\n", + "Downloading readme: 100%|██████████████████| 10.8k/10.8k [00:00<00:00, 1.65MB/s]\n", + "Downloading extra modules: 100%|███████████| 3.29k/3.29k [00:00<00:00, 35.1MB/s]\n", + "Downloading extra modules: 100%|████████████| 39.9k/39.9k [00:00<00:00, 325kB/s]\n", + "Downloading data: 100%|██████████████████████| 153M/153M [00:05<00:00, 27.9MB/s]\n", + "Generating train split: 722 examples [00:01, 665.83 examples/s] \n", + "Generating test split: 632 examples [00:01, 583.76 examples/s]\n", + "Generating validation split: 586 examples [00:01, 545.90 examples/s]\n", + "Generating other split: 885 examples [00:01, 813.03 examples/s] \n", + "Generating invalidated split: 504 examples [00:01, 472.40 examples/s] \n", + "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = train\n", + "Map (num_proc=8): 100%|████████████████| 722/722 [00:07<00:00, 91.68 examples/s]\n", + "Processing mozilla-foundation/common_voice_6_1 (split : train):: 100%|█| 722/722\n", + "\n", + "Dataset conversion finished !\n", + "\u001b[0m/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", + "The version_base parameter is not specified.\n", + "Please specify a compatability version level, or None.\n", + "Will assume defaults for version 1.1\n", + " @hydra.main(config_name='hfds_config', config_path=None)\n", + "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: 
FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", + "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", + "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", + " warnings.warn(\n", + "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = validation\n", + "Map (num_proc=8): 100%|████████████████| 586/586 [00:06<00:00, 85.82 examples/s]\n", + "Processing mozilla-foundation/common_voice_6_1 (split : validation):: 100%|█| 58\n", + "\n", + "Dataset conversion finished !\n", + "\u001b[0m/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", + "The version_base parameter is not specified.\n", + "Please specify a compatability version level, or None.\n", + "Will assume defaults for version 1.1\n", + " @hydra.main(config_name='hfds_config', config_path=None)\n", + "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", + "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", + "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", + " warnings.warn(\n", + "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = test\n", + "Map (num_proc=8): 100%|████████████████| 632/632 [00:08<00:00, 74.60 examples/s]\n", + "Processing mozilla-foundation/common_voice_6_1 (split : test):: 100%|█| 632/632 \n", + "\n", + "Dataset conversion finished !\n", + "\u001b[0m" + ] + } + ], "source": [ "# !python convert_hf_dataset_to_nemo.py \\\n", "# --data_root \"datasets/$LANGUAGE/\" \\\n", @@ -272,9 +363,7 @@ " split=\"test\" \\\n", " ensure_ascii=False \\\n", " use_auth_token=True" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -287,16 +376,36 @@ }, { "cell_type": "code", + "execution_count": 12, "metadata": { "id": "j7WAGLX59C26" }, + "outputs": [], "source": [ "train_manifest = f\"{manifest_dir}/train/train_mozilla-foundation_common_voice_6_1_manifest.json\"\n", "dev_manifest = f\"{manifest_dir}/validation/validation_mozilla-foundation_common_voice_6_1_manifest.json\"\n", "test_manifest = f\"{manifest_dir}/test/test_mozilla-foundation_common_voice_6_1_manifest.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'datasets/ja/mozilla-foundation/common_voice_6_1/ja'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } ], - "execution_count": null, - "outputs": [] + "source": [ + "manifest_dir" + ] }, { "cell_type": "markdown", @@ -326,9 +435,11 @@ }, { "cell_type": "code", + "execution_count": 14, 
"metadata": { "id": "EdkJYxUirp7C" }, + "outputs": [], "source": [ "# Manifest Utils\n", "from tqdm.auto import tqdm\n", @@ -345,22 +456,20 @@ " write_manifest(filepath, data)\n", " print(f\"Finished writing manifest: {filepath}\")\n", " return filepath" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 15, "metadata": { "id": "HngfzcwOijy4" }, + "outputs": [], "source": [ "train_manifest_data = read_manifest(train_manifest)\n", "dev_manifest_data = read_manifest(dev_manifest)\n", "test_manifest_data = read_manifest(test_manifest)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -373,16 +482,16 @@ }, { "cell_type": "code", + "execution_count": 16, "metadata": { "id": "T2iwnvhXimfG" }, + "outputs": [], "source": [ "train_text = [data['text'] for data in train_manifest_data]\n", "dev_text = [data['text'] for data in dev_manifest_data]\n", "test_text = [data['text'] for data in test_manifest_data]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -397,9 +506,11 @@ }, { "cell_type": "code", + "execution_count": 17, "metadata": { "id": "XpUb_pI5imhh" }, + "outputs": [], "source": [ "from collections import defaultdict\n", "\n", @@ -410,22 +521,63 @@ " for character in text:\n", " charset[character] += 1\n", " return charset" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 18, "metadata": { "id": "obcPlrOJimju" }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7a31ac5a54444b02b1bef6148a6fbf1d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Computing character set: 0%| | 0/722 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "\n", @@ -607,9 +791,7 @@ "plt.xlabel(\"# of occurrences\")\n", "plt.ylabel(\"# of tokens\")\n", "plt.xlim(0, MAX_COUNT);" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -622,9 +804,19 @@ }, { "cell_type": "code", + "execution_count": 25, "metadata": { "id": "9G6laS0ojV-B" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens with <= 5 occurrences : 1041\n" + ] + } + ], "source": [ "UNCOMMON_TOKENS_COUNT = 5\n", "\n", @@ -635,9 +827,7 @@ " chars_with_infrequent_occurrence.update(set(token_list))\n", "\n", "print(f\"Number of tokens with <= {UNCOMMON_TOKENS_COUNT} occurrences : {len(chars_with_infrequent_occurrence)}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -652,9 +842,20 @@ }, { "cell_type": "code", + "execution_count": 26, "metadata": { "id": "jnh_pnL2jWAY" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original train+dev+test vocab size : 1433\n", + "New train vocab size : 1255\n" + ] + } + ], "source": [ "all_tokens = set.union(train_dev_set, test_set)\n", "print(f\"Original train+dev+test vocab size : {len(all_tokens)}\")\n", @@ -662,9 +863,7 @@ "extra_kanji = set(test_oov)\n", "train_token_set = all_tokens - extra_kanji\n", "print(f\"New train vocab size : {len(train_token_set)}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -692,38 +891,48 @@ }, { "cell_type": "code", + "execution_count": 27, "metadata": { - "id": "kaX9WzK15Q6t", - "cellView": "form" + "cellView": "form", + "id": "kaX9WzK15Q6t" }, + "outputs": [], "source": [ "#@title Dakuten 
normalization\n", "perform_dakuten_normalization = True #@param [\"True\", \"False\"] {type:\"raw\"}\n", "PERFORM_DAKUTEN_NORMALIZATION = bool(perform_dakuten_normalization)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 28, "metadata": { "id": "HiEZVEshOp-y" }, + "outputs": [], "source": [ "import unicodedata\n", "def process_dakuten(text):\n", " normalized_text = unicodedata.normalize('NFD', text)\n", " normalized_text = normalized_text.replace(\"\\u3099\", \"\").replace(\"\\u309A\", \"\")\n", " return normalized_text" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 29, "metadata": { "id": "pV4kOgpvjWGg" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After dakuten normalization, number of train tokens : 1210\n" + ] + } + ], "source": [ "if PERFORM_DAKUTEN_NORMALIZATION:\n", " normalized_train_token_set = set()\n", @@ -733,11 +942,8 @@ "\n", " print(f\"After dakuten normalization, number of train tokens : {len(normalized_train_token_set)}\")\n", "else:\n", - " normalized_train_token_set = train_token_set\n", - "" - ], - "execution_count": null, - "outputs": [] + " normalized_train_token_set = train_token_set\n" + ] }, { "cell_type": "markdown", @@ -754,9 +960,11 @@ }, { "cell_type": "code", + "execution_count": 30, "metadata": { "id": "NN3asqvsrp_S" }, + "outputs": [], "source": [ "# Preprocessing steps\n", "import re\n", @@ -780,9 +988,7 @@ " text = data['text']\n", " data['text'] = process_dakuten(text)\n", " return data" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -797,9 +1003,11 @@ }, { "cell_type": "code", + "execution_count": 31, "metadata": { "id": "mwNtHeHLjqJl" }, + "outputs": [], "source": [ "# Processing pipeline\n", "def apply_preprocessors(manifest, preprocessors):\n", @@ -809,15 +1017,15 @@ "\n", " print(\"Finished processing manifest !\")\n", " return manifest" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 32, "metadata": { "id": "xB06YHmDr-Ja" }, + "outputs": [], "source": [ "# List of pre-processing functions\n", "PREPROCESSORS = [\n", @@ -825,15 +1033,166 @@ " remove_extra_kanji,\n", " remove_dakuten,\n", "]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 33, "metadata": { "id": "4lqUvpkrr7bQ" }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1077e49a05ca4aab8a4c6ef5f4b356f6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Applying remove_special_characters: 0%| | 0/722 [00:00] 16.24K --.-KB/s in 0s \n", + "\n", + "2024-06-26 19:55:08 (159 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [16631/16631]\n", + "\n" + ] + } + ], "source": [ "if not os.path.exists(\"scripts/process_asr_text_tokenizer.py\"):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 37, "metadata": { "id": "SKA9rrpbm3nu" }, + "outputs": [], "source": [ "#@title Tokenizer Config { display-mode: \"form\" }\n", "TOKENIZER_TYPE = \"bpe\" #@param [\"bpe\", \"unigram\"]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1464,15 +1876,15 @@ }, { "cell_type": "code", + "execution_count": 38, "metadata": { "id": "lO_uskUEm2ZG" }, + "outputs": [], 
"source": [ "# << VOCAB SIZE can be changed to any value larger than (len(train_dev_set) + 2)! >>\n", "VOCAB_SIZE = len(train_dev_set) + 2" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1489,9 +1901,88 @@ }, { "cell_type": "code", + "execution_count": 53, "metadata": { "id": "yT-SBPN2Ox6Y" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:root:Corpus already exists at path : tokenizers/ja/text_corpus/document.txt\n", + "[NeMo I 2024-06-26 20:52:30 sentencepiece_tokenizer:316] Processing tokenizers/ja/text_corpus/document.txt and store at tokenizers/ja/tokenizer_spe_bpe_v1208\n", + "sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tokenizers/ja/text_corpus/document.txt --model_prefix=tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer --vocab_size=1208 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1\n", + "sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : \n", + "trainer_spec {\n", + " input: tokenizers/ja/text_corpus/document.txt\n", + " input_format: \n", + " model_prefix: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer\n", + " model_type: BPE\n", + " vocab_size: 1208\n", + " self_test_sample_size: 0\n", + " character_coverage: 1\n", + " input_sentence_size: 0\n", + " shuffle_input_sentence: 1\n", + " seed_sentencepiece_size: 1000000\n", + " shrinking_factor: 0.75\n", + " max_sentence_length: 4192\n", + " num_threads: 16\n", + " num_sub_iterations: 2\n", + " max_sentencepiece_length: 16\n", + " split_by_unicode_script: 1\n", + " split_by_number: 1\n", + " split_by_whitespace: 1\n", + " split_digits: 0\n", + " pretokenization_delimiter: \n", + " treat_whitespace_as_suffix: 0\n", + " allow_whitespace_only_pieces: 0\n", + " required_chars: \n", + " byte_fallback: 0\n", + " vocabulary_output_piece_score: 1\n", + " train_extremely_large_corpus: 0\n", + " seed_sentencepieces_file: \n", + " hard_vocab_limit: 0\n", + " use_all_vocab: 0\n", + " unk_id: 0\n", + " bos_id: -1\n", + " eos_id: -1\n", + " pad_id: -1\n", + " unk_piece: \n", + " bos_piece: \n", + " eos_piece: \n", + " pad_piece: \n", + " unk_surface: ⁇ \n", + " enable_differential_privacy: 0\n", + " differential_privacy_noise_level: 0\n", + " differential_privacy_clipping_threshold: 0\n", + "}\n", + "normalizer_spec {\n", + " name: nmt_nfkc\n", + " add_dummy_prefix: 1\n", + " remove_extra_whitespaces: 1\n", + " escape_whitespaces: 1\n", + " normalization_rule_tsv: \n", + "}\n", + "denormalizer_spec {}\n", + "trainer_interface.cc(353) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.\n", + "trainer_interface.cc(185) LOG(INFO) Loading corpus: tokenizers/ja/text_corpus/document.txt\n", + "trainer_interface.cc(409) LOG(INFO) Loaded all 1308 sentences\n", + "trainer_interface.cc(425) LOG(INFO) Adding meta_piece: \n", + "trainer_interface.cc(430) LOG(INFO) Normalizing sentences...\n", + "trainer_interface.cc(539) LOG(INFO) all chars count=24630\n", + "trainer_interface.cc(560) LOG(INFO) Alphabet size=1207\n", + "trainer_interface.cc(561) LOG(INFO) Final character coverage=1\n", + "trainer_interface.cc(592) LOG(INFO) Done! preprocessed 1308 sentences.\n", + "trainer_interface.cc(598) LOG(INFO) Tokenizing input sentences with whitespace: 1308\n", + "trainer_interface.cc(609) LOG(INFO) Done! 
1308\n", + "trainer_interface.cc(687) LOG(INFO) Saving model: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.model\n", + "trainer_interface.cc(699) LOG(INFO) Saving vocabs: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.vocab\n", + "Serialized tokenizer at location : tokenizers/ja/tokenizer_spe_bpe_v1208\n", + "INFO:root:Done!\n" + ] + } + ], "source": [ "!python scripts/process_asr_text_tokenizer.py \\\n", " --manifest=$train_manifest_cleaned,$dev_manifest_cleaned \\\n", @@ -1502,21 +1993,78 @@ " --spe_character_coverage=1.0 \\\n", " --no_lower_case \\\n", " --log" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 55, "metadata": { "id": "G5TxLHtKPW4E" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokenizer directory : tokenizers/ja/tokenizer_spe_bpe_v1208/\n" + ] + } + ], "source": [ "TOKENIZER_DIR = f\"{tokenizer_dir}/tokenizer_spe_{TOKENIZER_TYPE}_v{VOCAB_SIZE}/\"\n", "print(\"Tokenizer directory :\", TOKENIZER_DIR)" - ], - "execution_count": null, - "outputs": [] + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Created token '' at ID 1208\n", + "INFO: Created token '' at ID 1209\n", + "INFO: Created token '' at ID 1210\n", + "INFO: Created token '' at ID 1211\n", + "INFO: Created token '' at ID 1212\n", + "INFO: Created token '' at ID 1213\n", + "INFO: Created token '' at ID 1214\n", + "INFO: Created token '' at ID 1215\n", + "INFO: New tokenizer vocab size: 1216\n", + "INFO: Created new tokenizer at: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model\n" + ] + } + ], + "source": [ + "# ! protoc --python_out=/workspace/nemo/NeMo-opensource/scripts/tokenizers/ sentencepiece_model.proto\n", + "\n", + "!python /workspace/nemo/NeMo-opensource/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \\\n", + "--input_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.model \\\n", + "--output_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model\\\n", + "--is_userdefined \\\n", + "--tokens \"\" \"\" \"\" \"\" \\\n", + " \"\" \"\" \"\" \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "import sentencepiece as spm\n", + "\n", + "sp = spm.SentencePieceProcessor()\n", + "sp.load('/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model')\n", + "\n", + "vocab_list = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]\n", + "# Save the vocabulary to a file\n", + "# with open('new_tokenizer.vocab', 'w') as vocab_file:\n", + "# for token in vocab_list:\n", + "# vocab_file.write(token + '\\n')\n" + ] }, { "cell_type": "markdown", @@ -1531,9 +2079,19 @@ }, { "cell_type": "code", + "execution_count": 52, "metadata": { "id": "8sAz2_RyMu7J" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens : 1208\n" + ] + } + ], "source": [ "# Number of tokens in tokenizer -\n", "with open(os.path.join(TOKENIZER_DIR, 'tokenizer.vocab')) as f:\n", @@ -1541,15 +2099,15 @@ "\n", "num_tokens = len(tokens)\n", "print(\"Number of tokens : \", num_tokens)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zktPYPCxNXNO" }, + 
"outputs": [], "source": [ "if num_tokens < VOCAB_SIZE:\n", " print(\n", @@ -1557,9 +2115,7 @@ " f\"with vocab size = {VOCAB_SIZE}. Current number of tokens = {num_tokens}. \"\n", " f\"Please reconstruct the tokenizer with fewer tokens\"\n", " )" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1574,14 +2130,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mmSj18iQQTZx" }, + "outputs": [], "source": [ "model = nemo_asr.models.ASRModel.from_pretrained(\"stt_en_citrinet_512\", map_location='cpu')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1601,15 +2157,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FmFQKwGkoaIx" }, + "outputs": [], "source": [ "# Preserve the decoder parameters in case weight matching can be done later\n", "pretrained_decoder = model.decoder.state_dict()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1624,14 +2180,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "-8SKfYSVorgg" }, + "outputs": [], "source": [ "model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type=\"bpe\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1648,9 +2204,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "367FBtRDorkT" }, + "outputs": [], "source": [ "# Insert preserved model weights if shapes match\n", "if model.decoder.decoder_layers[0].weight.shape == pretrained_decoder['decoder_layers.0.weight'].shape:\n", @@ -1658,9 +2216,7 @@ " logging.info(\"Decoder shapes matched - restored weights from pre-trained model\")\n", "else:\n", " logging.info(\"\\nDecoder shapes did not match - could not restore decoder weights from pre-trained model.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1675,22 +2231,24 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "lfDW0gQVpm4d" }, + "outputs": [], "source": [ "#@title Freeze Encoder { display-mode: \"form\" }\n", "freeze_encoder = True #@param [\"False\", \"True\"] {type:\"raw\"}\n", "freeze_encoder = bool(freeze_encoder)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oLkm96zkplrX" }, + "outputs": [], "source": [ "if freeze_encoder:\n", " model.encoder.freeze()\n", @@ -1699,9 +2257,7 @@ "else:\n", " model.encoder.unfreeze()\n", " logging.info(\"Model encoder has been un-frozen\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1718,14 +2274,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "pBYAd_2-R2r3" }, + "outputs": [], "source": [ "cfg = copy.deepcopy(model.cfg)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1740,9 +2296,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "NfbtgTC-RyzF" }, + "outputs": [], "source": [ "# Setup new tokenizer\n", "cfg.tokenizer.dir = TOKENIZER_DIR\n", @@ -1750,9 +2308,7 @@ "\n", "# Set tokenizer config\n", "model.cfg.tokenizer = cfg.tokenizer" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1769,21 +2325,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "wnw-ygClmg7t" }, + "outputs": [], "source": [ "# Setup train/val/test configs\n", "print(OmegaConf.to_yaml(cfg.train_ds))" - ], - 
"execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OlOowK7rRAvs" }, + "outputs": [], "source": [ "# Setup train, validation, test configs\n", "with open_dict(cfg):\n", @@ -1810,23 +2368,21 @@ " cfg.test_ds.pin_memory = True\n", " cfg.test_ds.use_start_end_token = True\n", " cfg.test_ds.trim_silence = True" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "y98ZAhBtRtoD" }, + "outputs": [], "source": [ "# setup model with new configs\n", "model.setup_training_data(cfg.train_ds)\n", "model.setup_multiple_validation_data(cfg.validation_ds)\n", "model.setup_multiple_test_data(cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1850,9 +2406,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ozJDj6BktKw-" }, + "outputs": [], "source": [ "def analyse_ctc_failures_in_model(model):\n", " count_ctc_failures = 0\n", @@ -1889,52 +2447,52 @@ " model = model.train()\n", "\n", " return count_ctc_failures, am_seq_lengths, target_seq_lengths" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "hJGUcq2BtKzw" }, + "outputs": [], "source": [ "results = analyse_ctc_failures_in_model(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "crEWxvI2tK2S" }, + "outputs": [], "source": [ "num_ctc_failures, am_seq_lengths, target_seq_lengths = results" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "L8M0-mSI1Jp5" }, + "outputs": [], "source": [ "if num_ctc_failures > 0:\n", " logging.warning(f\"\\nCTC loss will fail for {num_ctc_failures} samples ({num_ctc_failures * 100./ float(len(am_seq_lengths))} % of samples)!\\n\"\n", " f\"Increase the vocabulary size of the tokenizer so that this number becomes close to zero !\")\n", "else:\n", " logging.info(\"No CTC failure cases !\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "00wKre0W1Jsx" }, + "outputs": [], "source": [ "# Compute average ratio of T / U\n", "avg_T = sum(am_seq_lengths) / float(len(am_seq_lengths))\n", @@ -1949,9 +2507,7 @@ "print(f\"Average Target sequence length = {avg_U}\")\n", "print()\n", "print(f\"Ratio of Average AM sequence length to target sequence length = {avg_length_ratio}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1966,14 +2522,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "sS-xoplxSTJv" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(cfg.optim))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1988,9 +2544,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Io55nnbdXoeG" }, + "outputs": [], "source": [ "with open_dict(model.cfg.optim):\n", " model.cfg.optim.lr = 0.025\n", @@ -1998,9 +2556,7 @@ " model.cfg.optim.sched.warmup_steps = None # Remove default number of steps of warmup\n", " model.cfg.optim.sched.warmup_ratio = 0.10 # 10 % warmup\n", " model.cfg.optim.sched.min_lr = 1e-9" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2015,9 +2571,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": 
"6Vb35_oRh_sV" }, + "outputs": [], "source": [ "with open_dict(model.cfg.spec_augment):\n", " model.cfg.spec_augment.freq_masks = 2\n", @@ -2026,9 +2584,7 @@ " model.cfg.spec_augment.time_width = 0.05\n", "\n", "model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2043,30 +2599,30 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "cellView": "form", "id": "UfUlPXZS6vlV" }, + "outputs": [], "source": [ "#@title Metric\n", "use_cer = True #@param [\"False\", \"True\"] {type:\"raw\"}\n", "log_prediction = True #@param [\"False\", \"True\"] {type:\"raw\"}\n", "\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6qpbMNZh68p9" }, + "outputs": [], "source": [ "model.wer.use_cer = use_cer\n", "model.wer.log_prediction = log_prediction" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2083,9 +2639,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bonpx5sRS07M" }, + "outputs": [], "source": [ "import torch\n", "import pytorch_lightning as ptl\n", @@ -2111,15 +2669,15 @@ "\n", "# finally, update the model's internal config\n", "model.cfg = model._cfg" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SR4CiViFS8Ww" }, + "outputs": [], "source": [ "from nemo.utils import exp_manager\n", "\n", @@ -2141,15 +2699,15 @@ "config = OmegaConf.structured(config)\n", "\n", "logdir = exp_manager.exp_manager(trainer, config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OlvyYwYWTsl6" }, + "outputs": [], "source": [ "try:\n", " from google import colab\n", @@ -2163,21 +2721,19 @@ " %tensorboard --logdir /content/experiments/lang-$LANGUAGE/ASR-Model-Language-$LANGUAGE/\n", "else:\n", " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6X21Q2qfVLvG" }, + "outputs": [], "source": [ "%%time\n", "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2192,16 +2748,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "DoWNVNYGOaMX" }, + "outputs": [], "source": [ "save_path = f\"Model-{LANGUAGE}.nemo\"\n", "model.save_to(f\"{save_path}\")\n", "print(f\"Model saved at path : {os.getcwd() + os.path.sep + save_path}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2216,5 +2772,29 @@ "While the focus was on a small dataset for Japanese, nearly all of this information can be used for larger datasets and other scenarios where compute is limited, or the model's size prevents fine-tuning the entire model." 
] } - ] -} \ No newline at end of file + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index c9c547a8383e..ee5ae7392708 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -1,29 +1,8 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "collapsed_sections": [ - "dm-qqTdZDUlZ", - "GGKgsW5gvAuf", - "0CqpJGR6ecYW" - ], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "pEYsuj0J9pId" }, @@ -38,20 +17,25 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", + "\n", + "\n", + "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", "import os\n", + "import sys\n", "\n", "# Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg\n", - "!pip install text-unidecode\n", - "!pip install matplotlib>=3.3.2\n", + "# !pip install wget\n", + "# !apt-get install sox libsndfile1 ffmpeg\n", + "# !pip install text-unidecode\n", + "# !pip install matplotlib>=3.3.2\n", "\n", + "sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo/collections'))\n", + "sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo'))\n", "## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", "# !mkdir configs\n", @@ -60,6 +44,9 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "cTV4WLrArmxS" + }, "source": [ "# ASR Domain Adaptation with Adapters\n", "\n", @@ -70,13 +57,13 @@ "-----\n", "\n", "In this tutorial, we will showcase **Adapters** : A powerful method to efficiently adapt a pre-trained model to a new dataset (with minimal amounts of data, even just 30 minutes !) with minimal compute resources (on a single GPU, in around 10 minutes of training time).\n" - ], - "metadata": { - "id": "cTV4WLrArmxS" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "f-LSGyL4xw9c" + }, "source": [ "## What are Adapters?\n", "\n", @@ -99,13 +86,13 @@ "-----\n", "\n", "Adapter modules such as this are usually initialized. 
The initial output of the adapter will always be zeros to prevent degradation of the original model's performance due to the addition of such modules." - ], - "metadata": { - "id": "f-LSGyL4xw9c" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "YGn1__-Jv2Bq" + }, "source": [ "## Advantages of Adapters\n", "\n", @@ -116,13 +103,13 @@ "- **Fast convergence**: Since the adapters only need to learn to modify the module's output slightly, and each adapter has a trivial parameter cost, they converge rapidly.\n", "- **Adapt only the encoder**: Adapters can be used anywhere, but they are most commonly used in just the encoder, keeping the decoder modules frozen. This allows the decoder to be unaffected by costly CTC/RNN-T training, which takes time to converge, and just the adapter modules in the encoder need to be updated.\n", "- **Dynamic and flexible adaptation**: Since adapter modules can be added any number of times, a single shared \"core\" model can have multiple adapters that are enabled/disabled dynamically to adapt to numerous scenarios. This potentially offers the case where a single \"core\" model is shared across multiple users, and each user has a small, personal adapter module used for personalization. " - ], - "metadata": { - "id": "YGn1__-Jv2Bq" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "8d7y1cygv4MP" + }, "source": [ "## Limitations of Adapters\n", "\n", @@ -135,13 +122,13 @@ " - **Note**: There is nothing fundamentally wrong with still changing the vocabulary of a model that supports adapters. The benefits of adapters will reduce significantly and require costly training (similar in time and memory to finetuning). The model can no longer recover its performance by disabling all of its adapters.\n", "- **Easy to overfit**: Since adapters enable domain adaptation on very small amounts of speech data, it is trivial to rapidly overfit these datasets and significantly degrade performance on the original domain. \n", " - **Note**: This can be overcome with some experimentation, further boosted by the fast experimentation cycle that adapters enable." - ], - "metadata": { - "id": "8d7y1cygv4MP" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "mtYWTi0irkS6" + }, "source": [ "# Dataset preparation\n", "\n", @@ -152,13 +139,15 @@ "First, we prepare some datasets that the original model was **not trained on**, making it a new domain to be adapted. \n", "\n", "In this tutorial, we will be utilizing the `AN4` dataset - also known as the Alphanumeric dataset, which was collected and published by Carnegie Mellon University. We chose this dataset primarily because it is **very small in size** (`<1 hours of training data`), **easy to overfit when training from scratch / fine-tuning by changing the decoder** (`previous tutorials can mostly get around 10-20% WER with fine-tuning without hyperparameter tuning`), and its **text is perfectly supported by the tokenization/decoding scheme of the model**." 
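"\n",
"-----\n",
"\n",
"For reference, the AN4 manifests prepared below are JSON-lines files in the standard NeMo format, one utterance per line. A representative entry (the path and transcript shown here are hypothetical) looks like:\n",
"\n",
"```python\n",
"import json\n",
"\n",
"# Standard NeMo manifest schema: audio path, duration in seconds, transcript\n",
"entry = {\n",
"    'audio_filepath': 'datasets/an4/wav/an4_clstk/fash/an251-fash-b.wav',  # hypothetical path\n",
"    'duration': 1.0,\n",
"    'text': 'yes',\n",
"}\n",
"print(json.dumps(entry))  # one such object per line of the manifest\n",
"```"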
- ], - "metadata": { - "id": "mtYWTi0irkS6" - } + ] }, { "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "NpKgT6q5-gNk" + }, + "outputs": [], "source": [ "import os\n", "\n", @@ -167,15 +156,37 @@ "\n", "if not os.path.exists(\"scripts/process_an4_data.py\"):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/dataset_processing/process_an4_data.py" - ], - "metadata": { - "id": "NpKgT6q5-gNk" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0wZZuUDi_gEV" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "******\n", + "Tarfile already exists.\n", + "Finished conversion.\n", + "******\n", + "Preparing AN4 dataset ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/nemo/NeMo-opensource/tutorials/asr/asr_adapters/scripts/process_an4_data.py:43: FutureWarning: get_duration() keyword argument 'filename' has been renamed to 'path' in version 0.10.0.\n", + "\tThis alias will be removed in version 1.0.\n", + " duration = librosa.core.get_duration(filename=audio_path)\n", + "\u001b[0mAN4 prepared !\n" + ] + } + ], "source": [ "import wget\n", "import tarfile \n", @@ -220,28 +231,26 @@ " --data_root=$an4_path\n", "\n", "print(\"AN4 prepared !\")" - ], - "metadata": { - "id": "0wZZuUDi_gEV" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "9fiqQeWDAXsH" + }, + "outputs": [], "source": [ "# Manifest filepaths\n", "TRAIN_MANIFEST = os.path.join(data_dir, \"an4\", \"train_manifest.json\")\n", "TEST_MANIFEST = os.path.join(data_dir, \"an4\", \"test_manifest.json\")" - ], - "metadata": { - "id": "9fiqQeWDAXsH" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "q2nxi5RzAfZ5" + }, "source": [ "# Prepare the \"base\" model\n", "\n", @@ -250,39 +259,50 @@ "-----\n", "\n", "Most importantly, we discuss a simple way to enable Adapter specific support to a pre-trained model checkpoint - by modifying the `encoder` config before loading the model." 
- ], - "metadata": { - "id": "q2nxi5RzAfZ5" - } + ] }, { "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "F-wt9y5iAali" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-04 11:54:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:626: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n", + " \n" + ] + } + ], "source": [ "import torch\n", "from omegaconf import OmegaConf, open_dict\n", "from pytorch_lightning import Trainer\n", "\n", - "import nemo.collections.asr as nemo_asr" - ], - "metadata": { - "id": "F-wt9y5iAali" - }, - "execution_count": null, - "outputs": [] + "# import nemo.collections.asr as nemo_asr\n", + "import asr as nemo_asr" + ] }, { "cell_type": "code", - "source": [ - "model_name = \"stt_en_conformer_ctc_small\"" - ], + "execution_count": 7, "metadata": { "id": "uVOfU7gsCI5u" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "model_name = \"stt_en_conformer_ctc_small\"" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "TitUAeq67Hkl" + }, "source": [ "## Prepare an Adapter-compatible Encoder\n", "\n", @@ -293,33 +313,45 @@ "- Extract the model config from the \"base\" model.\n", "- Update the `encoder` section of the config to a subclass of that model (which does have Adapter support)\n", "- Initialize the model with this new config, therefore enabling adapter support." - ], - "metadata": { - "id": "TitUAeq67Hkl" - } + ] }, { "cell_type": "markdown", - "source": [ - "- Extract just the config of the model." - ], "metadata": { "id": "5V5UY-5c8FDv" - } + }, + "source": [ + "- Extract just the config of the model." + ] }, { "cell_type": "code", - "source": [ - "cfg = nemo_asr.models.ASRModel.from_pretrained(model_name, return_config=True)" - ], + "execution_count": 8, "metadata": { "id": "RzwLAHVqAqD9" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 11:54:53 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n", + "[NeMo I 2024-07-04 11:54:53 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo\n", + "[NeMo I 2024-07-04 11:54:53 common:924] Instantiating model from pre-trained checkpoint\n" + ] + } + ], + "source": [ + "cfg = nemo_asr.models.ASRModel.from_pretrained(model_name, return_config=True)" + ] }, { "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "O6xAz38-A_Bh" + }, + "outputs": [], "source": [ "from nemo.core import adapter_mixins\n", "\n", @@ -332,55 +364,148 @@ " \n", " print(\"Updated encoder _target_ model :\", model_cfg.encoder._target_)\n", " return model_cfg" - ], - "metadata": { - "id": "O6xAz38-A_Bh" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "- Update the model config's `encoder` section to support Adapters." - ], "metadata": { "id": "TDk2VMXI8OkG" - } + }, + "source": [ + "- Update the model config's `encoder` section to support Adapters." 
+ ] }, { "cell_type": "code", - "source": [ - "cfg = update_model_config_to_support_adapter(cfg)" - ], + "execution_count": 10, "metadata": { "id": "iyp4xUOLBi0v" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated encoder _target_ model : nemo.collections.asr.modules.conformer_encoder.ConformerEncoderAdapter\n" + ] + } + ], + "source": [ + "cfg = update_model_config_to_support_adapter(cfg)" + ] }, { "cell_type": "markdown", - "source": [ - "- Finally load the model with the updated config." - ], "metadata": { "id": "26NTK00w8VIt" - } + }, + "source": [ + "- Finally load the model with the updated config." + ] }, { "cell_type": "code", - "source": [ - "model = nemo_asr.models.ASRModel.from_pretrained(model_name, override_config_path=cfg)" - ], + "execution_count": 11, "metadata": { "id": "7r36mkUGBvsy" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 11:54:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n", + "[NeMo I 2024-07-04 11:54:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo\n", + "[NeMo I 2024-07-04 11:54:54 common:924] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-07-04 11:54:55 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-04 11:54:56 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: true\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " shuffle_n: 2048\n", + " is_tarred: true\n", + " tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar\n", + " \n", + "[NeMo W 2024-07-04 11:54:56 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", + " Validation config : \n", + " manifest_filepath:\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " is_tarred: false\n", + " tarred_audio_filepaths: na\n", + " \n", + "[NeMo W 2024-07-04 11:54:56 modelPT:178] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath:\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " is_tarred: false\n", + " tarred_audio_filepaths: na\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 11:54:56 features:289] PADDING: 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "E0704 11:54:56.411043 124023995053888 driver.py:396] Call to cuInit results in CUDA_ERROR_NO_DEVICE\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 11:54:56 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n" + ] + } + ], + "source": [ + "model = nemo_asr.models.ASRModel.from_pretrained(model_name, override_config_path=cfg)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "x0C2r7388cRd" + }, "source": [ "-----\n", "\n", @@ -391,13 +516,25 @@ "**Recommendation**:\n", "\n", "You should normally start with 1-5 epochs of adaptation over your entire new domain, and then increase or decrease your number of training steps to trade off a balance in accuracy on general speech." 
- ], - "metadata": { - "id": "x0C2r7388cRd" - } + ] }, { "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "sWRUXzjQMWN5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: False, used: False\n", + "TPU available: False, using: 0 TPU cores\n", + "HPU available: False, using: 0 HPUs\n" + ] + } + ], "source": [ "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", "max_steps = 300\n", @@ -407,28 +544,26 @@ " log_every_n_steps=5, check_val_every_n_epoch=3)\n", "\n", "model.set_trainer(trainer)" - ], - "metadata": { - "id": "sWRUXzjQMWN5" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "tJBriqr3tQV7" + }, + "outputs": [], "source": [ "# utility method\n", "import json\n", "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n" - ], - "metadata": { - "id": "tJBriqr3tQV7" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "dm-qqTdZDUlZ" + }, "source": [ "## [Optional] Check if the new domain is compatible with the original decoder\n", "\n", @@ -437,22 +572,32 @@ "-----\n", "\n", "If this check fails, the training run might crash, or silently allow the model to learn to produce `⁇` tokens (when using SentencePiece tokenizers)." - ], - "metadata": { - "id": "dm-qqTdZDUlZ" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Parse the base character set" - ], "metadata": { "id": "UKTiAPV_sdFI" - } + }, + "source": [ + "### Parse the base character set" + ] }, { "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "WgogR3taD7NA" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base charset : [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n" + ] + } + ], "source": [ "train_data = read_manifest(TRAIN_MANIFEST)\n", "base_sets = [set(list(sample['text'])) for sample in train_data]\n", @@ -462,24 +607,24 @@ "base_charset = list(sorted(list(base_charset)))\n", "\n", "print(\"Base charset :\", base_charset)" - ], - "metadata": { - "id": "WgogR3taD7NA" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Check if there are invalid characters" - ], "metadata": { "id": "x-0fzrfPshJj" - } + }, + "source": [ + "### Check if there are invalid characters" + ] }, { "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "5laUkRf5Eb6l" + }, + "outputs": [], "source": [ "def check_valid_charset_in_vocab(model, charset):\n", " model_vocab = model.decoder.vocabulary\n", @@ -491,75 +636,112 @@ " num_invalid += 1\n", "\n", " print(\"Number of invalid tokens :\", num_invalid)" - ], - "metadata": { - "id": "5laUkRf5Eb6l" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "check_valid_charset_in_vocab(model, base_charset)" - ], + "execution_count": 17, "metadata": { "id": "5rEUqs7AFh5j" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of invalid tokens : 0\n" + ] + } + ], + "source": [ + "check_valid_charset_in_vocab(model, base_charset)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Sf-2EHznGkI1" + }, "source": [ "# Evaluate original performance on AN4 dev set\n", "\n", "Now that we possess a model capable of supporting adapters, let us 
quickly test the performance of the pre-trained model on the AN4 test set without any training or fine-tuning." - ], - "metadata": { - "id": "Sf-2EHznGkI1" - } + ] }, { "cell_type": "code", - "source": [ - "if not os.path.exists('scripts/transcribe_speech.py'):\n", + "execution_count": 18, + "metadata": { + "id": "Ak4v4aWjGoQH" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-07-04 11:20:18-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/transcribe_speech.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 23427 (23K) [text/plain]\n", + "Saving to: ‘scripts/transcribe_speech.py’\n", + "\n", + "transcribe_speech.p 100%[===================>] 22.88K --.-KB/s in 0s \n", + "\n", + "2024-07-04 11:20:18 (113 MB/s) - ‘scripts/transcribe_speech.py’ saved [23427/23427]\n", + "\n", + "--2024-07-04 11:20:19-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/speech_to_text_eval.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 8539 (8.3K) [text/plain]\n", + "Saving to: ‘scripts/speech_to_text_eval.py’\n", + "\n", + "speech_to_text_eval 100%[===================>] 8.34K --.-KB/s in 0s \n", + "\n", + "2024-07-04 11:20:19 (82.1 MB/s) - ‘scripts/speech_to_text_eval.py’ saved [8539/8539]\n", + "\n" + ] + } + ], + "source": [ + "if not os.path.exists('scripts/transcribe_speech.py'):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/transcribe_speech.py\n", "\n", "if not os.path.exists('scripts/speech_to_text_eval.py'):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/speech_to_text_eval.py" - ], - "metadata": { - "id": "Ak4v4aWjGoQH" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "# temporarily save current model\n", - "model.save_to(\"/content/unadapted_model.nemo\")" - ], + "execution_count": null, "metadata": { "id": "OVlBKWCiIHw7" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "# temporarily save current model\n", + "model.save_to(\"/content/unadapted_model.nemo\")" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "r03iDw9k-dAm" + }, "source": [ "-----\n", "\n", "The following evaluation script will properly transcribe the AN4 test set, and score it against its ground truth." 
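"\n",
"The reported metric is a standard word error rate. As a minimal sketch of the same computation (`word_error_rate` is a NeMo utility; the strings below are toy examples):\n",
"\n",
"```python\n",
"from nemo.collections.asr.metrics.wer import word_error_rate\n",
"\n",
"refs = ['go to seven one two']  # ground-truth transcripts (toy example)\n",
"hyps = ['go to seven one too']  # model predictions (toy example)\n",
"\n",
"# 1 substitution over 5 reference words -> 0.2, i.e. 20% WER\n",
"print(word_error_rate(hypotheses=hyps, references=refs))\n",
"```"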
- ], - "metadata": { - "id": "r03iDw9k-dAm" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C6YbPt70H0-N" + }, + "outputs": [], "source": [ "!python scripts/speech_to_text_eval.py \\\n", " model_path=\"/content/unadapted_model.nemo\" \\\n", @@ -567,50 +749,48 @@ " output_filename=\"/content/unadapted_predictions.json\" \\\n", " batch_size=32 \\\n", " use_cer=False" - ], - "metadata": { - "id": "C6YbPt70H0-N" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "2VBQO3w3swu8" + }, "source": [ "------\n", "\n", "Check the predictions of the current model" - ], - "metadata": { - "id": "2VBQO3w3swu8" - } + ] }, { "cell_type": "code", - "source": [ - "!head -n 5 /content/unadapted_predictions.json" - ], + "execution_count": null, "metadata": { "id": "SE8uoRLsJA9F" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!head -n 5 /content/unadapted_predictions.json" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "muRBgHHe-n7E" + }, "source": [ "-----\n", "\n", "Overall, the model does quite well, obtaining roughly 6% Word Error Rate without prior training on this dataset. \n", "\n", "**Note**: Pre-trained models in NeMo are trained on several thousands of hours of speech, so it is unsurprising why this model is this accurate without any training on this toy dataset. For more realistic cases, we usually observe the range of 10-30% WER for out-of-domain speech." - ], - "metadata": { - "id": "muRBgHHe-n7E" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "b-L3prIzs3CW" + }, "source": [ "# Setup training and evaluation of the model\n", "\n", @@ -621,22 +801,38 @@ "**Note**: Each model may have special parameters in their data loader. Please refer to the configs of the pre-trained models to determine what additional changes are necessary). 
Below recommendations are primarily for Conformer CTC and may differ from model to model.\n", "\n", "You can parse the model config via - `print(OmegaConf.to_yaml(model.cfg))`" - ], - "metadata": { - "id": "b-L3prIzs3CW" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Setup dataloaders" - ], "metadata": { "id": "V2WirN5KJpsD" - } + }, + "source": [ + "## Setup dataloaders" + ] }, { "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "F0GIxhyCJmFv" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Value(False)\n", + "[NeMo I 2024-07-04 12:07:47 collections:196] Dataset loaded with 948 files totalling 0.71 hours\n", + "[NeMo I 2024-07-04 12:07:47 collections:197] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-04 16:53:51 collections:196] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-04 16:53:51 collections:197] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-04 16:53:52 collections:196] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-04 16:53:52 collections:197] 0 files were filtered totalling 0.00 hours\n" + ] + } + ], "source": [ "with open_dict(model.cfg):\n", " # Train Dataloader\n", @@ -651,28 +847,28 @@ "model.setup_training_data(model.cfg.train_ds)\n", "model.setup_multiple_validation_data(model.cfg.validation_ds)\n", "model.setup_multiple_test_data(model.cfg.validation_ds)" - ], - "metadata": { - "id": "F0GIxhyCJmFv" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "T3VuqcGTNuIJ" + }, "source": [ "## Setup Spectrogram Augmentation\n", "\n", "For this experiment we will continue to use the original spec augmentation config in the base model, however you may find better results by modifying the strength of this augmentation.\n", "\n", "**Note**: The script inside ASR examples **disables spec augment entirely**. This is done in order to provide a stable default to measure the best possible adaptation case, but may severely degrade the performance on general speech. Please be careful when copying the hyper parameters from the tutorial to the script for large scale experimentation." - ], - "metadata": { - "id": "T3VuqcGTNuIJ" - } + ] }, { "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "T-XFuaA3OlOB" + }, + "outputs": [], "source": [ "with open_dict(model.cfg):\n", " # Spec Augment\n", @@ -682,15 +878,13 @@ " model.cfg.spec_augment.time_width = model.cfg.spec_augment.time_width # Can be changed\n", "\n", "model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)" - ], - "metadata": { - "id": "T-XFuaA3OlOB" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "xGpdUWl_tGuA" + }, "source": [ "## Setup optimizer and scheduler\n", "\n", @@ -701,25 +895,76 @@ "Feel free to modify these values to see the effect on adapters' convergence.\n", "\n", "**Note**: The hyper parameters below correspond to the base model and may not match those applied in the ASR examples! Please note that the script the examples defaults to an **AdamW** optimizer with a **CosineAnnealing** scheduler, where as the config of Conformers is geneally a **AdamW** optimizer with a **NoamAnnealing** scheduler. 
The *learning rate*, *weight decay* and other hyper parameters may not be exactly the same between the tutorial and the example scripts, so please be careful when transferring the hyper parameters for large scale experiments." - ], - "metadata": { - "id": "xGpdUWl_tGuA" - } + ] }, { "cell_type": "code", - "source": [ - "if 'optim' in model.cfg:\n", - " print(OmegaConf.to_yaml(model.cfg.optim))" - ], + "execution_count": 21, "metadata": { "id": "UDEIfMTcP6j6" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name: adamw\n", + "lr: 2.0\n", + "betas:\n", + "- 0.9\n", + "- 0.98\n", + "weight_decay: 0\n", + "sched:\n", + " name: NoamAnnealing\n", + " d_model: 176\n", + " warmup_steps: 10000\n", + " warmup_ratio: null\n", + " min_lr: 1.0e-06\n", + "\n" + ] + } + ], + "source": [ + "if 'optim' in model.cfg:\n", + " print(OmegaConf.to_yaml(model.cfg.optim))" + ] }, { "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "tp_8FGPcKjMd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 19:47:39 modelPT:723] Optimizer config = AdamW (\n", + " Parameter Group 0\n", + " amsgrad: False\n", + " betas: [0.9, 0.98]\n", + " capturable: False\n", + " differentiable: False\n", + " eps: 1e-08\n", + " foreach: None\n", + " fused: None\n", + " lr: 0.1\n", + " maximize: False\n", + " weight_decay: 0.0\n", + " )\n", + "[NeMo I 2024-07-04 19:47:39 lr_scheduler:915] Scheduler \"\" \n", + " will be used during training (effective maximum steps = 300) - \n", + " Parameters : \n", + " (d_model: 176\n", + " warmup_steps: 100\n", + " warmup_ratio: null\n", + " min_lr: 1.0e-06\n", + " max_steps: 300\n", + " )\n" + ] + } + ], "source": [ "with open_dict(model.cfg):\n", " model.cfg.optim.lr = 0.1\n", @@ -727,62 +972,85 @@ " model.cfg.optim.sched.warmup_steps = 100\n", "\n", "model.setup_optimization(model.cfg.optim);" - ], - "metadata": { - "id": "tp_8FGPcKjMd" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "AGrThAt9Qh0D" + }, "source": [ "# Adapters: Supported Components\n", "\n", "A NeMo model may have multiple types of adapters that are supported in each of their components. Let us see at a glance what are some of the adapter types supported by the Conformer ASR model.\n", "\n", "**Note**: Every domain may support their own types of adapters, and use them in different ways. Please refer to the documentation of each domain for information on the adapter support." - ], - "metadata": { - "id": "AGrThAt9Qh0D" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Wq1JLbNvROcL" + }, "source": [ "-----\n", "Let's start with the modules in which the model will support adapters. We can select these adapters with a special syntax to construct \"Module adapters\".\n", "\n", "**Note**: `''` refers to the \"default\" adapter - usually the `encoder` but it is model dependent. It may also be that no specific modules are provided, in which case only `default` adapters will be available." 
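"\n",
"For example, the naming convention looks like this (a sketch only; `my_adapter` is a placeholder name, and `adapter_cfg` is the config constructed later in this tutorial):\n",
"\n",
"```python\n",
"# A bare name targets the default module (the encoder for this model):\n",
"# model.add_adapter(name='my_adapter', cfg=adapter_cfg)\n",
"\n",
"# A module-qualified 'module:name' targets a specific component instead:\n",
"# model.add_adapter(name='decoder:my_adapter', cfg=adapter_cfg)\n",
"```"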
- ], - "metadata": { - "id": "Wq1JLbNvROcL" - } + ] }, { "cell_type": "code", - "source": [ - "if hasattr(model, 'adapter_module_names'):\n", - " print(model.adapter_module_names)" - ], + "execution_count": 23, "metadata": { "id": "fRIDhU8RVBwi" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['', 'encoder', 'decoder', 'joint']\n" + ] + } + ], + "source": [ + "if hasattr(model, 'adapter_module_names'):\n", + " print(model.adapter_module_names)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "u5BOWWBjfQwN" + }, "source": [ "-----\n", "Next, we can try to obtain the accepted types of each of the child modules in the Model." - ], - "metadata": { - "id": "u5BOWWBjfQwN" - } + ] }, { "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "iNnSp_azQ2u8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Module : ConformerEncoderAdapter\n", + "\n", + "\n", + "\n", + "\n", + "Module : ConvASRDecoder\n", + "\n", + "\n" + ] + } + ], "source": [ "for module in model.children():\n", " if hasattr(module, 'get_accepted_adapter_types'):\n", @@ -792,26 +1060,24 @@ " for tp in types:\n", " print(tp)\n", " print()" - ], - "metadata": { - "id": "iNnSp_azQ2u8" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "YXTC4LiSnB2O" + }, "source": [ "-----\n", "\n", "As you can see, a single component of the model may support one or more adapter types (or none at all)! Below, we will experiment with the simple Linear Adapters, but as an exercise, you might try to use other adapter types present here." - ], - "metadata": { - "id": "YXTC4LiSnB2O" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "WFCUrYxnGPt3" + }, "source": [ "# Adapters: Creation and Preparation\n", "\n", @@ -822,24 +1088,26 @@ "We first import a config for a basic `LinearAdapter` most often used in literature. \n", "\n", "`LinearAdapter` is a simple network comprising LayerNorm, a bottleneck Linear layer, an activation, and an upcast Linear layer (so that input and output channel dim match). We provide some configuration parameters (such as the input dim and the bottleneck dim)." 
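"\n",
"Concretely, the computation inside such an adapter looks roughly like the following (a simplified sketch for intuition, not NeMo's exact implementation):\n",
"\n",
"```python\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"class TinyLinearAdapter(nn.Module):\n",
"    # LayerNorm -> bottleneck Linear -> activation -> upcast Linear, plus a residual add.\n",
"    def __init__(self, d_model: int, bottleneck: int):\n",
"        super().__init__()\n",
"        self.norm = nn.LayerNorm(d_model)           # 'pre' norm position\n",
"        self.down = nn.Linear(d_model, bottleneck)  # compress D -> H\n",
"        self.act = nn.SiLU()                        # 'swish' activation\n",
"        self.up = nn.Linear(bottleneck, d_model)    # expand H -> D\n",
"        nn.init.zeros_(self.up.weight)              # zero-init: the adapter initially\n",
"        nn.init.zeros_(self.up.bias)                # outputs zeros, leaving the base model unchanged\n",
"\n",
"    def forward(self, x):\n",
"        return x + self.up(self.act(self.down(self.norm(x))))\n",
"\n",
"x = torch.randn(4, 10, 176)                 # (batch, time, d_model = 176 for this model)\n",
"print(TinyLinearAdapter(176, 32)(x).shape)  # torch.Size([4, 10, 176])\n",
"```"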
- ], - "metadata": { - "id": "WFCUrYxnGPt3" - } + ] }, { "cell_type": "code", - "source": [ - "from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig" - ], + "execution_count": 25, "metadata": { "id": "oZZr6vSntuyX" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig" + ] }, { "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "dlj0Yud4MxOi" + }, + "outputs": [], "source": [ "#%% [code]\n", "#@title Adapter Setup { display-mode: \"form\" }\n", @@ -847,15 +1115,23 @@ "adapter_dim = 32 #@param {type:\"integer\"}\n", "adapter_activation = \"swish\" #@param {type:\"string\"}\n", "adapter_norm_position = \"pre\" #@param [\"pre\", \"post\"]" - ], - "metadata": { - "id": "dlj0Yud4MxOi" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "Uv8WRQkXU3mu" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LinearAdapterConfig(in_features=176, dim=32, activation='swish', norm_position='pre', dropout=0.0, adapter_strategy=ResidualAddAdapterStrategyConfig(stochastic_depth=0.0, l2_lambda=0.0, _target_='nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy'), _target_='nemo.collections.common.parts.adapter_modules.LinearAdapter')\n" + ] + } + ], "source": [ "adapter_cfg = LinearAdapterConfig(\n", " in_features=model.cfg.encoder.d_model, # conformer specific model dim. Every layer emits this dim at its output.\n", @@ -864,83 +1140,127 @@ " norm_position=adapter_norm_position, # whether to use LayerNorm at the beginning or the end of the adapter\n", ")\n", "print(adapter_cfg)" - ], - "metadata": { - "id": "Uv8WRQkXU3mu" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "pIECyKxit58r" + }, "source": [ "## Add a new adapter module\n", "\n", "Now that our adapter config is ready. Next, we perform a check to see what is the size of the original model and what its size will be after adding the adapter module." - ], - "metadata": { - "id": "pIECyKxit58r" - } + ] }, { "cell_type": "code", - "source": [ - "model.summarize()" - ], + "execution_count": 28, "metadata": { "id": "-MbSTbYiYtnB" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "data": { + "text/plain": [ + " | Name | Type | Params | Mode \n", + "--------------------------------------------------------------------------------\n", + "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | train\n", + "1 | encoder | ConformerEncoderAdapter | 13.0 M | train\n", + "2 | decoder | ConvASRDecoder | 181 K | train\n", + "3 | loss | CTCLoss | 0 | train\n", + "4 | spec_augmentation | SpectrogramAugmentation | 0 | train\n", + "5 | wer | WER | 0 | train\n", + "--------------------------------------------------------------------------------\n", + "13.2 M Trainable params\n", + "0 Non-trainable params\n", + "13.2 M Total params\n", + "52.616 Total estimated model params size (MB)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summarize()" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "vjYmPbwCC0LZ" + }, "source": [ "-----\n", "\n", "Next, we use `add_adapter` to add adapter blocks to the `encoder`.\n", "\n", "A single line can be used to add adapter modules to every layer of the `encoder` module. 
We pass it a unique name to identify this adapter, along with the adapter config (the name makes it easy to enable or disable specific adapters later)."
-   ],
-   "metadata": {
-    "id": "vjYmPbwCC0LZ"
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "source": [
-    "model.add_adapter(name=adapter_name, cfg=adapter_cfg)"
-   ],
+   "execution_count": 29,
   "metadata": {
    "id": "El6ewd1GX9V7"
   },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "model.add_adapter(name=adapter_name, cfg=adapter_cfg)"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {
+    "id": "jMsmj1W-DTSd"
+   },
   "source": [
    "-----\n",
    "\n",
    "As expected, the number of parameters increased by a marginal amount (roughly 185,000 parameters)."
-   ],
-   "metadata": {
-    "id": "jMsmj1W-DTSd"
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "source": [
-    "model.summarize()"
-   ],
+   "execution_count": 30,
   "metadata": {
    "id": "rIvw0_8iYpHW"
   },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "  | Name              | Type                              | Params | Mode \n",
+       "--------------------------------------------------------------------------------\n",
+       "0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train\n",
+       "1 | encoder           | ConformerEncoderAdapter           | 13.2 M | train\n",
+       "2 | decoder           | ConvASRDecoder                    | 181 K  | train\n",
+       "3 | loss              | CTCLoss                           | 0      | train\n",
+       "4 | spec_augmentation | SpectrogramAugmentation           | 0      | train\n",
+       "5 | wer               | WER                               | 0      | train\n",
+       "--------------------------------------------------------------------------------\n",
+       "13.3 M    Trainable params\n",
+       "0         Non-trainable params\n",
+       "13.3 M    Total params\n",
+       "53.360    Total estimated model params size (MB)"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.summarize()"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {
+    "id": "RH6cXPW2ZHdZ"
+   },
   "source": [
    "## Enable / Disable Adapters\n",
    "\n",
@@ -949,25 +1269,34 @@
    "For this purpose, we utilize the `model.set_enabled_adapters` method - it takes an optional `name` and a boolean value for `enabled`. If a name is not passed, it will enable or disable all available adapters.\n",
    "\n",
    "**Note**: We recommend training one adapter at a time, disjoint from all other adapters. This simplifies the selection of adapters for each particular domain. To do so - **disable all adapters first, then enable only the newly added adapter**."
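Following the note above, the "disable all, then enable one" pattern can be wrapped in a small helper once a model carries several domain adapters. The adapter names below are invented for illustration; the two `set_enabled_adapters` calls are exactly the ones demonstrated in the next cell.

```python
# Hypothetical multi-domain setup; only the 'AN4' adapter exists in this tutorial.
domain_adapters = ['AN4', 'callcenter', 'medical']

def activate_single_adapter(model, name: str) -> None:
    # Disable every registered adapter, then enable exactly one,
    # so each domain adapter is trained and evaluated in isolation.
    model.set_enabled_adapters(enabled=False)
    model.set_enabled_adapters(name=name, enabled=True)

activate_single_adapter(model, domain_adapters[0])
```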
- ], - "metadata": { - "id": "RH6cXPW2ZHdZ" - } + ] }, { "cell_type": "code", - "source": [ - "model.set_enabled_adapters(enabled=False) # disable all adapters\n", - "model.set_enabled_adapters(name=adapter_name, enabled=True) # enable only the current adapter we want to train" - ], + "execution_count": 31, "metadata": { "id": "ogUfDkjdZKHu" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 19:48:18 adapter_mixins:719] Setting adapter 'AN4' status : Enabled = False\n", + "[NeMo I 2024-07-04 19:48:18 adapter_mixins:734] Setting adapter 'AN4' status : Enabled = True\n" + ] + } + ], + "source": [ + "model.set_enabled_adapters(enabled=False) # disable all adapters\n", + "model.set_enabled_adapters(name=adapter_name, enabled=True) # enable only the current adapter we want to train" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "V87SBzdDY1x1" + }, "source": [ "## Training only the adapter(s)\n", "\n", @@ -976,25 +1305,49 @@ "We provide the general utility methods for this purpose - `model.freeze()` and `model.unfreeze_enabled_adapters()`. \n", "\n", "The second method will look up all the enabled adapters selected in the previous step and enable their gradient calculation so that they can be trained." - ], - "metadata": { - "id": "V87SBzdDY1x1" - } + ] }, { "cell_type": "code", - "source": [ - "model.freeze()\n", - "model.unfreeze_enabled_adapters()" - ], + "execution_count": 44, "metadata": { "id": "RN2YayAoYzaI" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.2.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.8.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] 
Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n",
+      "[NeMo I 2024-07-04 21:32:14 adapter_mixins:435] Unfrozen adapter : AN4\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.freeze()\n",
+    "model.unfreeze_enabled_adapters()"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {
+    "id": "5PriDOuwEbmp"
+   },
   "source": [
    "### Why are BatchNormalization layers being frozen?\n",
    "\n",
@@ -1006,35 +1359,83 @@
    "\n",
    "For this reason, `unfreeze_enabled_adapters()` has an argument `freeze_batchnorm=True` as the default. It will find all the batch normalization layers and disable this flag, so that the encoder layers remain exactly frozen even during adapter finetuning. This allows the original model performance to be recovered.\n",
    "\n"
-   ],
-   "metadata": {
-    "id": "5PriDOuwEbmp"
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "source": [
-    "model.summarize()"
-   ],
+   "execution_count": 45,
   "metadata": {
    "id": "Lf3pdwQ2Zch5"
   },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "  | Name              | Type                              | Params | Mode\n",
+       "-------------------------------------------------------------------------------\n",
+       "0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | eval\n",
+       "1 | encoder           | ConformerEncoderAdapter           | 13.2 M | eval\n",
+       "2 | decoder           | ConvASRDecoder                    | 181 K  | eval\n",
+       "3 | loss              | CTCLoss                           | 0      | eval\n",
+       "4 | spec_augmentation | SpectrogramAugmentation           | 0      | eval\n",
+       "5 | wer               | WER                               | 0      | eval\n",
+       "-------------------------------------------------------------------------------\n",
+       "185 K     Trainable params\n",
+       "13.2 M    Non-trainable params\n",
+       "13.3 M    Total params\n",
+       "53.360    Total estimated model params size (MB)"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.summarize()"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {
+    "id": "JI6C_TYGGgyZ"
+   },
   "source": [
    "-----\n",
    "\n",
    "Here we see that after the above steps, we will be training just ~185,000 parameters out of a 13.3 M parameter model."
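As a quick cross-check of the summary above, the trainable/total split can be recomputed directly from the parameters. A small sketch, assuming `model` is the NeMo model prepared in the preceding cells:

```python
# After model.freeze() + model.unfreeze_enabled_adapters(), only the adapter's
# parameters should still have requires_grad=True.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / Total: {total:,} "
      f"({trainable / total:.2%} of the model is updated)")
```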
- ], - "metadata": { - "id": "JI6C_TYGGgyZ" - } + ] }, { "cell_type": "code", + "execution_count": 46, + "metadata": { + "id": "w9ciIw-2bSHq" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 21:32:25 exp_manager:396] Experiments will be logged at experiments/ASR-Adapters/2024-07-04_21-32-25\n", + "[NeMo I 2024-07-04 21:32:25 exp_manager:842] TensorboardLogger has been set up\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-04 21:32:25 exp_manager:952] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 300. Please ensure that max_steps will run for at least 3 epochs to ensure that checkpointing will not error out.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 21:32:25 exp_manager:971] Preemption is supported only on GPUs, disabling preemption\n" + ] + } + ], "source": [ "# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n", "from nemo.utils import exp_manager\n", @@ -1057,80 +1458,233 @@ "exp_config = OmegaConf.structured(exp_config)\n", "\n", "logdir = exp_manager.exp_manager(trainer, exp_config)" - ], - "metadata": { - "id": "w9ciIw-2bSHq" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "# Finally, train the adapters\n", - "trainer.fit(model)" - ], + "execution_count": 47, "metadata": { "id": "cY2TJod3ZfyE" }, - "execution_count": null, - "outputs": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-04 21:32:36 modelPT:723] Optimizer config = AdamW (\n", + " Parameter Group 0\n", + " amsgrad: False\n", + " betas: [0.9, 0.98]\n", + " capturable: False\n", + " differentiable: False\n", + " eps: 1e-08\n", + " foreach: None\n", + " fused: None\n", + " lr: 0.1\n", + " maximize: False\n", + " weight_decay: 0.0\n", + " )\n", + "[NeMo I 2024-07-04 21:32:36 lr_scheduler:915] Scheduler \"\" \n", + " will be used during training (effective maximum steps = 300) - \n", + " Parameters : \n", + " (d_model: 176\n", + " warmup_steps: 100\n", + " warmup_ratio: null\n", + " min_lr: 1.0e-06\n", + " max_steps: 300\n", + " )\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params | Mode\n", + "-------------------------------------------------------------------------------\n", + "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | eval\n", + "1 | encoder | ConformerEncoderAdapter | 13.2 M | eval\n", + "2 | decoder | ConvASRDecoder | 181 K | eval\n", + "3 | loss | CTCLoss | 0 | eval\n", + "4 | spec_augmentation | SpectrogramAugmentation | 0 | eval\n", + "5 | wer | WER | 0 | eval\n", + "-------------------------------------------------------------------------------\n", + "185 K Trainable params\n", + "13.2 M Non-trainable params\n", + "13.3 M Total params\n", + "53.360 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0d39171be96b430d8599e806ecaedd7a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? 
[00:00<?, ?it/s]"
+ [... notebook error output omitted: the full Python exception traceback from `trainer.fit(model)` descends from pytorch_lightning's fit and optimizer loops into NeMo's EncDecCTCModel.training_step -> EncDecCTCModel.forward -> ConformerEncoder.forward_internal -> ConvSubsampling.forward (nemo/collections/asr/parts/submodules/subsampling.py) ...]
_global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1521\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1522\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/signal_handling.py:66\u001b[0m, in \u001b[0;36m_set_SIGCHLD_handler..handler\u001b[0;34m(signum, frame)\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mhandler\u001b[39m(signum, frame):\n\u001b[1;32m 64\u001b[0m \u001b[38;5;66;03m# This following call uses `waitid` with WNOHANG from C side. Therefore,\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;66;03m# Python can still get and update the process status successfully.\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[43m_error_if_any_worker_fails\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m previous_handler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(previous_handler)\n", + "\u001b[0;31mRuntimeError\u001b[0m: DataLoader worker (pid 2281096) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit." + ] + } + ], + "source": [ + "# Finally, train the adapters\n", + "trainer.fit(model)" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "A82ylXSZuL1T" + }, "source": [ "-----\n", "\n", "After training, save the final checkpoint to a nemo file to evaluate. We also save just the adapter module itself, as that is much smaller than the size of the full model." 
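+    "\n",
+    "If the `trainer.fit(model)` cell above instead crashed with `RuntimeError: DataLoader worker ... is killed by signal: Bus error` (as in the captured output), the dataloader worker processes have most likely exhausted shared memory, e.g. when running in a container with a small `/dev/shm`. One possible workaround sketch, assuming you cannot raise the shared-memory limit itself, is to rebuild the dataloaders without worker processes and rerun training (slower, since batches are then prepared in the main process):\n",
+    "\n",
+    "```python\n",
+    "# Workaround sketch: with num_workers=0 the data is loaded in the main\n",
+    "# process, so no worker shared-memory segments are required.\n",
+    "model.cfg.train_ds.num_workers = 0\n",
+    "model.cfg.validation_ds.num_workers = 0\n",
+    "model.setup_training_data(model.cfg.train_ds)\n",
+    "model.setup_validation_data(model.cfg.validation_ds)\n",
+    "```"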
- ], - "metadata": { - "id": "A82ylXSZuL1T" - } + ] }, { "cell_type": "code", - "source": [ - "model.save_to(\"/content/adapted_model.nemo\")" - ], + "execution_count": null, "metadata": { "id": "7tDdE9lZbvhJ" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "model.save_to(\"/content/adapted_model.nemo\")" + ] }, { "cell_type": "code", - "source": [ - "model.save_adapters('/content/adapter_modules.pt')" - ], + "execution_count": null, "metadata": { "id": "L9yO-M-oL3Cy" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "model.save_adapters('/content/adapter_modules.pt')" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Ak9v58RmdNJT" + }, "source": [ "# Evaluate the adapted model\n", "\n", "Now that we have finished the adaptation step and saved a trained NeMo file, we can evaluate the accuracy of our adapted model on the test set of AN4." - ], - "metadata": { - "id": "Ak9v58RmdNJT" - } + ] }, { "cell_type": "markdown", - "source": [ - "## Evaluate the adapter-enabled model" - ], "metadata": { "id": "r-rjNJAvuZxu" - } + }, + "source": [ + "## Evaluate the adapter-enabled model" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_Ps6_45mdJpM" + }, + "outputs": [], "source": [ "!python scripts/speech_to_text_eval.py \\\n", " model_path=\"/content/adapted_model.nemo\" \\\n", @@ -1138,74 +1692,74 @@ " output_filename=\"/content/adapted_predictions.json\" \\\n", " batch_size=32 \\\n", " use_cer=False" - ], - "metadata": { - "id": "_Ps6_45mdJpM" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "MegsAIcQG5MJ" + }, "source": [ "-----\n", "\n", "We could significantly improve the accuracy of this model on the AN4 dataset with a very short training schedule with a small number of parameters. \n", "\n", "**Note**: Since AN4 is a relatively simple dataset, the gains are very large in this example. We generally observe more modest improvements with such short training schedules on realistic datasets (but gains of this range are easily attainable with more data or precise training schedules to avoid overfitting)." - ], - "metadata": { - "id": "MegsAIcQG5MJ" - } + ] }, { "cell_type": "markdown", - "source": [ - "Let us compare the adapted model's predictions below - " - ], "metadata": { "id": "T6c_p530wMwG" - } + }, + "source": [ + "Let us compare the adapted model's predictions below - " + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vlK3PdMhtlv1" + }, + "outputs": [], "source": [ "print(\"Original\")\n", "!head -n 5 /content/unadapted_predictions.json\n", "print(\"Adapted\")\n", "!head -n 5 /content/adapted_predictions.json" - ], - "metadata": { - "id": "vlK3PdMhtlv1" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "CCkH10jqd4q1" + }, "source": [ "## Evaluate the adapter-disabled model\n", "\n", "Now, let us disable the adapters and recover the original performance of the model. We do this as a sanity test, to check that indeed the \"base\" model is still intact, even if adapter training has occurred." 
- ], - "metadata": { - "id": "CCkH10jqd4q1" - } + ] }, { "cell_type": "code", - "source": [ - "model.set_enabled_adapters(enabled=False)\n", - "model.save_to(\"/content/adapter_disabled_model.nemo\")" - ], + "execution_count": null, "metadata": { "id": "1R6wHGgRdRKX" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "model.set_enabled_adapters(enabled=False)\n", + "model.save_to(\"/content/adapter_disabled_model.nemo\")" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IhGLtRwdeGRf" + }, + "outputs": [], "source": [ "!python scripts/speech_to_text_eval.py \\\n", " model_path=\"/content/adapter_disabled_model.nemo\" \\\n", @@ -1213,26 +1767,26 @@ " output_filename=\"/content/adapter_disabled_predictions.json\" \\\n", " batch_size=32 \\\n", " use_cer=False" - ], - "metadata": { - "id": "IhGLtRwdeGRf" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "GGKgsW5gvAuf" + }, "source": [ "# [EXTRA] Check that accuracy can be recovered after adaptation\n", "\n", "This is a more explicit test than simply checking the WER above - here we do sample by sample check to ensure that predicted text remains the same." - ], - "metadata": { - "id": "GGKgsW5gvAuf" - } + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YFKN7QYuvBzP" + }, + "outputs": [], "source": [ "original_transcripts = read_manifest('/content/unadapted_predictions.json')\n", "adapter_disabled_transcripts = read_manifest('/content/adapter_disabled_predictions.json')\n", @@ -1244,39 +1798,39 @@ " print(\"Original = \", orig['pred_text'])\n", " print(\"Adapters disabled = \", new['pred_text']) \n", " print()" - ], - "metadata": { - "id": "YFKN7QYuvBzP" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "0CqpJGR6ecYW" + }, "source": [ "# [EXTRA] Add as many adapters as needed\n", "\n", "Now that we have showcased how to utilize adapters for domain adaptation, we can take this further and adapt even more datasets - as many as needed!\n", "\n", "There is no implicit restriction on how many adapters can be added, as shown below. Still, we do recommend freezing all adapters and training only one at a time to prevent cross-interaction between adapters." 
- ], - "metadata": { - "id": "0CqpJGR6ecYW" - } + ] }, { "cell_type": "code", - "source": [ - "model.add_adapter(name=\"AN4-v2\", cfg=adapter_cfg)" - ], + "execution_count": null, "metadata": { "id": "13vZHFFEeK_g" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "model.add_adapter(name=\"AN4-v2\", cfg=adapter_cfg)" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iOrJ72SUelp6" + }, + "outputs": [], "source": [ "model.set_enabled_adapters(enabled=False)\n", "model.set_enabled_adapters(name='AN4-v2', enabled=True)\n", @@ -1285,15 +1839,13 @@ "model.unfreeze_enabled_adapters()\n", "\n", "model.summarize()" - ], - "metadata": { - "id": "iOrJ72SUelp6" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "EIli6c_OvKDH" + }, "source": [ "# Further reading\n", "\n", @@ -1302,10 +1854,37 @@ "Please follow the following articles that discuss the use of adapters in ASR - \n", "- [Exploiting Adapters for Cross-lingual Low-resource Speech Recognition](https://arxiv.org/abs/2105.11905)\n", "- [Efficient Adapter Transfer of Self-Supervised Speech Models for Automatic Speech Recognition](https://arxiv.org/abs/2202.03218)\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "dm-qqTdZDUlZ", + "GGKgsW5gvAuf", + "0CqpJGR6ecYW" ], - "metadata": { - "id": "EIli6c_OvKDH" - } + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } From 28989fa5c6bddf3ce27bf83cc6781f6bfc3d1bba Mon Sep 17 00:00:00 2001 From: kolubex Date: Thu, 11 Jul 2024 08:56:03 +0000 Subject: [PATCH 2/4] working_code_it1 --- .gitignore | 1 + .../ctc_model_QuartzNet15x5Base copy.yaml | 149 ++ balu_codes/ctc_model_QuartzNet15x5Base.yaml | 265 +++ .../model_config_from_transcribe_py copy.yaml | 259 +++ .../model_config_from_transcribe_py.yaml | 269 +++ balu_codes/testing_av_code.ipynb | 431 +++++ balu_codes/transcribe.py | 11 +- nemo/collections/asr/data/av_to_text.py | 29 +- nemo/collections/asr/models/__init__.py | 1 + nemo/collections/asr/models/av_ctc_models.py | 67 +- .../common/parts/preprocessing/collections.py | 4 +- tutorials/asr/Streaming_ASR.ipynb | 248 ++- .../asr/asr_adapters/ASR_with_Adapters.ipynb | 1676 ++++++++++++++--- 13 files changed, 2998 insertions(+), 412 deletions(-) create mode 100644 balu_codes/ctc_model_QuartzNet15x5Base copy.yaml create mode 100644 balu_codes/ctc_model_QuartzNet15x5Base.yaml create mode 100644 balu_codes/model_config_from_transcribe_py copy.yaml create mode 100644 balu_codes/model_config_from_transcribe_py.yaml create mode 100644 balu_codes/testing_av_code.ipynb diff --git a/.gitignore b/.gitignore index 1ff2a92cac64..4bc9a2cacb4f 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,4 @@ examples/neural_graphs/*.yml .hydra/ nemo_experiments/ +balu_codes/test_experiments/ \ No newline at end of file diff --git a/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml b/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml new file mode 100644 index 000000000000..4c40015a224d --- /dev/null +++ b/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml @@ -0,0 +1,149 @@ +preprocessor: + cls: 
nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + params: + normalize: per_feature + window_size: 0.02 + sample_rate: 16000 + window_stride: 0.01 + window: hann + features: 64 + n_fft: 512 + frame_splicing: 1 + dither: 1.0e-05 + stft_conv: false +spec_augment: + cls: nemo.collections.asr.modules.SpectrogramAugmentation + params: + rect_freq: 50 + rect_masks: 5 + rect_time: 120 + +a_model_name: QuartzNet15x5Base-En +sample_rate: 16000 +labels: + - ' ' + - a + - b + - c + - d + - e + - f + - g + - h + - i + - j + - k + - l + - m + - n + - o + - p + - q + - r + - s + - t + - u + - v + - w + - x + - y + - z + - '''' +train_ds: + manifest_filepath: /disk1/it1/annotations/manifest_train.json + video_frame_rate: 5 + # - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json #TBD + sample_rate: 16000 + batch_size: 1 + shuffle: true + num_workers: 0 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 + +validation_ds: + manifest_filepath: /disk1/it1/annotations/manifest_train.json + video_frame_rate: 5 + # - /manifests/librispeech/librivox-dev-other.json #TBD + sample_rate: 16000 + batch_size: 1 + shuffle: false + num_workers: 0 + pin_memory: true + use_start_end_token: false +test_ds: + manifest_filepath: /disk1/it1/annotations/manifest_train.json + # - /manifests/librispeech/librivox-dev-other.json #TBD + sample_rate: 16000 + batch_size: 1 + shuffle: false + num_workers: 0 + pin_memory: true + use_start_end_token: false + +av_encoder: + d_model: 512 + nhead: 4 + num_layers: 2 + dropout: 0.1 + +v_model: + feat_dim: 512 + +decoder: + cls: nemo.collections.asr.modules.ConvASRDecoder + params: + feat_in: 512 + num_classes: 28 + vocabulary: + - ' ' + - a + - b + - c + - d + - e + - f + - g + - h + - i + - j + - k + - l + - m + - n + - o + - p + - q + - r + - s + - t + - u + - v + - w + - x + - y + - z + - '''' +optim: + name: novograd + lr: 0.01 + betas: + - 0.8 + - 0.5 + weight_decay: 0.001 +target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModelBPE diff --git a/balu_codes/ctc_model_QuartzNet15x5Base.yaml b/balu_codes/ctc_model_QuartzNet15x5Base.yaml new file mode 100644 index 000000000000..d48014222f13 --- /dev/null +++ b/balu_codes/ctc_model_QuartzNet15x5Base.yaml @@ -0,0 +1,265 @@ +preprocessor: + cls: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + params: + normalize: per_feature + window_size: 0.02 + sample_rate: 16000 + window_stride: 0.01 + window: hann + features: 64 + n_fft: 512 + frame_splicing: 1 + dither: 1.0e-05 + stft_conv: false +spec_augment: + cls: nemo.collections.asr.modules.SpectrogramAugmentation + params: + rect_freq: 50 + rect_masks: 5 + rect_time: 120 +encoder: + cls: nemo.collections.asr.modules.ConvASREncoder + params: + feat_in: 64 + activation: relu + conv_mask: true + jasper: + - filters: 256 + repeat: 1 + kernel: + - 33 + stride: + - 2 + dilation: + - 1 + dropout: 0.0 + residual: false + separable: true + - filters: 256 + repeat: 5 + kernel: + - 33 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 256 + repeat: 5 + kernel: + - 33 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 256 + repeat: 5 + kernel: + - 33 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: 
true + separable: true + - filters: 256 + repeat: 5 + kernel: + - 39 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 256 + repeat: 5 + kernel: + - 39 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 256 + repeat: 5 + kernel: + - 39 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 51 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 51 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 51 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 63 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 63 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 63 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 75 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 75 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 5 + kernel: + - 75 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: true + separable: true + - filters: 512 + repeat: 1 + kernel: + - 87 + stride: + - 1 + dilation: + - 2 + dropout: 0.0 + residual: false + separable: true + - filters: 1024 + repeat: 1 + kernel: + - 1 + stride: + - 1 + dilation: + - 1 + dropout: 0.0 + residual: false +decoder: + cls: nemo.collections.asr.modules.ConvASRDecoder + params: + feat_in: 1024 + num_classes: 28 + vocabulary: + - ' ' + - a + - b + - c + - d + - e + - f + - g + - h + - i + - j + - k + - l + - m + - n + - o + - p + - q + - r + - s + - t + - u + - v + - w + - x + - y + - z + - '''' +optim: + name: novograd + lr: 0.01 + betas: + - 0.8 + - 0.5 + weight_decay: 0.001 +target: nemo.collections.asr.models.ctc_models.EncDecCTCModel diff --git a/balu_codes/model_config_from_transcribe_py copy.yaml b/balu_codes/model_config_from_transcribe_py copy.yaml new file mode 100644 index 000000000000..1a511e019423 --- /dev/null +++ b/balu_codes/model_config_from_transcribe_py copy.yaml @@ -0,0 +1,259 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: #TBD +train_ds: + manifest_filepath: + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json #TBD + sample_rate: 16000 + batch_size: 32 + shuffle: true + num_workers: 4 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 10.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: + - /manifests/librispeech/librivox-dev-other.json #TBD + sample_rate: 16000 + batch_size: 32 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false +test_ds: + manifest_filepath: + - /manifests/librispeech/librivox-dev-other.json #TBD + sample_rate: 16000 + batch_size: 32 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false +tokenizer: 
+ dir: /tokenizers/NeMo_ASR_SET/English/asr_set_3.0/tokenizer_spe_unigram_v128 + type: bpe + model_path: nemo:e06949b0b85a485e9f280ea6d19e5492_tokenizer.model + vocab_path: nemo:53bbc634b62446de83525753e95a50ac_vocab.txt + spe_tokenizer_vocab: nemo:ff63e3c43c5f4b95bff702425366a4a6_tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +# encoder: +# _target_: nemo.collections.asr.modules.ConformerEncoder +# feat_in: 80 +# feat_out: -1 +# n_layers: 18 +# d_model: 512 +# subsampling: striding +# subsampling_factor: 4 +# subsampling_conv_channels: 512 +# ff_expansion_factor: 4 +# self_attention_model: rel_pos +# n_heads: 8 +# att_context_size: +# - -1 +# - -1 +# xscaling: true +# untie_biases: true +# pos_emb_max_len: 5000 +# conv_kernel_size: 31 +# conv_norm_type: batch_norm +# dropout: 0.1 +# dropout_emb: 0.0 +# dropout_att: 0.1 + +av_enocder: + d_model: 512 + nhead: 4 + num_layers: 2 + dropout: 0.1 + +adapters: + #TBD + +decoder: # Keep it thse same as you are going by same decoder, note it has dimension. + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 2.0 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1.0e-06 +compute_eval_loss: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/model_config_from_transcribe_py.yaml b/balu_codes/model_config_from_transcribe_py.yaml new file mode 100644 index 000000000000..4951a0f966a1 --- /dev/null +++ b/balu_codes/model_config_from_transcribe_py.yaml @@ -0,0 +1,269 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +train_ds: + manifest_filepath: + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json + - - 
/data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json + sample_rate: 16000 + batch_size: 1 + shuffle: true + num_workers: 4 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 10.0 + min_duration: 0.1 + is_tarred: true + tarred_audio_filepaths: + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar + - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar + shuffle_n: 2048 + bucketing_strategy: synced_randomized + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: + - /manifests/librispeech/librivox-dev-other.json + - /manifests/librispeech/librivox-dev-clean.json + - /manifests/librispeech/librivox-test-other.json + - /manifests/librispeech/librivox-test-clean.json + sample_rate: 16000 + batch_size: 32 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false +test_ds: + manifest_filepath: + - /manifests/librispeech/librivox-dev-other.json + - /manifests/librispeech/librivox-dev-clean.json + - /manifests/librispeech/librivox-test-other.json + - /manifests/librispeech/librivox-test-clean.json + sample_rate: 16000 + batch_size: 32 + shuffle: false + num_workers: 8 + pin_memory: true + use_start_end_token: false +tokenizer: + dir: /tokenizers/NeMo_ASR_SET/English/asr_set_3.0/tokenizer_spe_unigram_v128 + type: bpe + model_path: nemo:e06949b0b85a485e9f280ea6d19e5492_tokenizer.model + vocab_path: nemo:53bbc634b62446de83525753e95a50ac_vocab.txt + spe_tokenizer_vocab: nemo:ff63e3c43c5f4b95bff702425366a4a6_tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w 
+ - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 2.0 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 10000 + warmup_ratio: null + min_lr: 1.0e-06 +compute_eval_loss: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/testing_av_code.ipynb b/balu_codes/testing_av_code.ipynb new file mode 100644 index 000000000000..a3a52c6b0971 --- /dev/null +++ b/balu_codes/testing_av_code.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/nemo/NeMo-opensource/nemo/collections/asr/__init__.py\n", + "/workspace/nemo/NeMo-opensource/nemo/core/__init__.py\n", + "/workspace/nemo/NeMo-opensource/nemo/__init__.py\n", + "/usr/local/lib/python3.10/dist-packages/lightning/__init__.py\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "\n", + "# Insert local paths at the beginning of sys.path\n", + "sys.path.insert(0, os.path.abspath('/workspace/nemo/NeMo-opensource/'))\n", + "\n", + "import nemo.collections.asr as nemo_asr\n", + "print(nemo_asr.__file__)\n", + "import nemo.core as nemo_core\n", + "print(nemo_core.__file__)\n", + "from nemo.core import adapter_mixins\n", + "import nemo\n", + "print(nemo.__file__)\n", + "import lightning\n", + "print(lightning.__file__)\n", + "# Restore the site-packages paths\n", + "# sys.path.extend(site_packages_paths)\n", + "\n", + "import torch\n", + "from omegaconf import OmegaConf, open_dict\n", + "from pytorch_lightning import Trainer\n", + "from lightning.pytorch.loggers import WandbLogger\n", + "wandb_logger = WandbLogger(project=\"NEMO_TEST\")\n", + "# import nemo.collections.asr as nemo_asr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-08 23:55:14 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", + "[NeMo I 2024-07-08 23:55:14 collections:319] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-08 23:55:14 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", + "[NeMo I 2024-07-08 23:55:14 collections:319] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-08 23:55:15 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", + "[NeMo I 2024-07-08 23:55:15 collections:319] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-08 23:55:15 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.\n", + "[NeMo I 2024-07-08 
23:55:15 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo\n", + "[NeMo I 2024-07-08 23:55:15 common:815] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-07-08 23:55:16 features:305] PADDING: 16\n", + "[NeMo I 2024-07-08 23:55:17 save_restore_connector:263] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.\n" + ] + } + ], + "source": [ + "TRAIN_MANIFEST = \"/disk1/it1/annotations/manifest_train.json\"\n", + "TEST_MANIFEST = \"/disk1/it1/annotations/manifest_train.json\"\n", + "override_config_file_path = \"/workspace/nemo/NeMo-opensource/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml\"\n", + "conf = OmegaConf.load(override_config_file_path)\n", + "OmegaConf.set_struct(conf, True)\n", + "model = nemo_asr.models.AV_EncDecCTCModel(conf)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-08 23:55:18 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", + "[NeMo I 2024-07-08 23:55:18 collections:319] 0 files were filtered totalling 0.00 hours\n" + ] + } + ], + "source": [ + "with open_dict(model.cfg):\n", + " # Train Dataloader\n", + " model.cfg.train_ds.manifest_filepath = TRAIN_MANIFEST\n", + " model.cfg.train_ds.batch_size = 32\n", + " model.cfg.train_ds.is_tarred = False\n", + " model.cfg.train_ds.tarred_audio_filepaths = None\n", + "\n", + " model.cfg.validation_ds.manifest_filepath = TEST_MANIFEST\n", + " model.cfg.validation_ds.batch_size = 32\n", + "\n", + "model.setup_training_data(model.cfg.train_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# model.summarize()\n", + "model.freeze()\n", + "# model.summarize()\n", + "modules_to_train = [model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, model.a_modal_embs\n", + " , model.v_modal_embs, model.decoder]\n", + "for module in modules_to_train:\n", + " module.train()\n", + " for param in module.parameters():\n", + " param.requires_grad = True\n", + " \n", + "\n", + "# model.summarize()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: GPU available: True (cuda), used: True\n", + "WARNING: Logging before flag parsing goes to stderr.\n", + "I0708 23:55:19.239430 134716664817472 rank_zero.py:64] GPU available: True (cuda), used: True\n", + "INFO: TPU available: False, using: 0 TPU cores\n", + "I0708 23:55:19.266383 134716664817472 rank_zero.py:64] TPU available: False, using: 0 TPU cores\n", + "INFO: HPU available: False, using: 0 HPUs\n", + "I0708 23:55:19.267290 134716664817472 rank_zero.py:64] HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-08 23:55:19 exp_manager:396] Experiments will be logged at test_experiments/test_wpe_quartz/2024-07-08_23-55-19\n", + "[NeMo I 2024-07-08 23:55:19 exp_manager:856] TensorboardLogger has been set up\n", + "[NeMo I 2024-07-08 23:55:19 exp_manager:871] WandBLogger has been set up\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-08 23:55:19 exp_manager:966] The checkpoint callback was told to monitor 
a validation value and trainer's max_steps was set to 3000. Please ensure that max_steps will run for at least 3 epochs to ensure that checkpointing will not error out.\n" + ] + } + ], + "source": [ + "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", + "max_steps = 3000\n", + "\n", + "trainer = Trainer(devices=1, accelerator=accelerator, max_steps=max_steps,\n", + " enable_checkpointing=False, logger=False,\n", + " log_every_n_steps=5, check_val_every_n_epoch=3)\n", + "\n", + "model.set_trainer(trainer)\n", + "\n", + "# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n", + "from nemo.utils import exp_manager\n", + "\n", + "\n", + "# Environment variable generally used for multi-node multi-gpu training.\n", + "# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.\n", + "os.environ.pop('NEMO_EXPM_VERSION', None)\n", + "\n", + "exp_config = exp_manager.ExpManagerConfig(\n", + " exp_dir=f'test_experiments/',\n", + " name=f\"test_wpe_quartz\",\n", + " checkpoint_callback_params=exp_manager.CallbackParams(\n", + " monitor=\"val_wer\",\n", + " mode=\"min\",\n", + " always_save_nemo=True,\n", + " save_best_model=True,\n", + " ),\n", + " create_wandb_logger=True,\n", + " wandb_logger_kwargs=OmegaConf.create({\"project\": \"NEMO_TEST\", \"name\": \"test_wpe_quartz\", \"log_model\":\"all\"}),\n", + ")\n", + "\n", + "exp_config = OmegaConf.structured(exp_config)\n", + "\n", + "logdir = exp_manager.exp_manager(trainer, exp_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "E0708 23:55:20.059351 134716664817472 jupyter.py:224] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlakshmipathi-balaji\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.17.4" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in test_experiments/wandb/run-20240708_235520-2024-07-08_23-55-19" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run test_wpe_quartz to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/lakshmipathi-balaji/NEMO_TEST" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/lakshmipathi-balaji/NEMO_TEST/runs/2024-07-08_23-55-19" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-08 23:55:21 modelPT:767] Optimizer config = Novograd (\n", + " Parameter Group 0\n", + " amsgrad: False\n", + " betas: [0.8, 0.5]\n", + " eps: 1e-08\n", + " grad_averaging: False\n", + " lr: 0.01\n", + " weight_decay: 0.001\n", + " )\n", + "[NeMo I 2024-07-08 23:55:21 lr_scheduler:772] Scheduler not initialized as no `sched` config supplied to setup_optimizer()\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params | Mode \n", + "---------------------------------------------------------------------\n", + "0 | a_model | EncDecCTCModel | 18.9 M | eval \n", + "1 | a_linear | Linear | 524 K | train\n", + "2 | v_linear | Linear | 262 K | train\n", + "3 | av_enocder_layer | TransformerEncoderLayer | 3.2 M | train\n", + "4 | av_encoder | TransformerEncoder | 6.3 M | train\n", + "5 | a_modal_embs | Embedding | 512 | train\n", + "6 | v_modal_embs | Embedding | 512 | train\n", + "7 | decoder | ConvASRDecoder | 14.9 K | train\n", + "8 | loss | CTCLoss | 0 | eval \n", + "9 | wer | WER | 0 | eval \n", + "---------------------------------------------------------------------\n", + "10.3 M Trainable params\n", + "18.9 M Non-trainable params\n", + "29.2 M Total params\n", + "116.740 Total estimated model params size (MB)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ea39ad975454155a69aa466a018c9bc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00 Optional[Dict[str, NeuralType]]: """Returns definitions of module output ports. """ return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'video_signal': NeuralType(('B', 'F', 'D'), ImageFeatureValue()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + 'audio_signal': [NeuralType(('B', 'T'), AudioSignal())], + 'a_sig_length': [NeuralType(tuple('B'), LengthsType())], + 'video_input_signal': [NeuralType(('B', 'T', 'D'), ChannelType())], + 'transcripts': [NeuralType(('B', 'T'), LabelsType())], + 'transcript_length': [NeuralType(tuple('B'), LengthsType())], + 'sample_id': [NeuralType(tuple('B'), LengthsType(), optional=True)], } def __init__( @@ -369,7 +368,7 @@ def __init__( pad_id: int = 0, return_sample_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, - video_frame_rate: int = 3, + video_frame_rate: int = 5, ): if type(manifest_filepath) == str: manifest_filepath = manifest_filepath.split(",") @@ -428,15 +427,18 @@ def _process_sample(self, index): vf = np.load(sample.video_featfile) # uniformly sample self.video_frame_rate frames from video at shape 0. 
         # TODO: @Balu, how would you do this? If you fix a single frame rate, you would need many feature dirs, one per frame rate.
-        assert vf.shape[0] == self.video_frame_rate, f"Video feature file {sample.video_featfile} has {vf.shape[0]} frame_feats, expected {self.video_frame_rate}"
+        assert vf.shape[0] == self.video_frame_rate*sample.duration, f"Video feature file {sample.video_featfile} has {vf.shape[0]} frame_feats, expected {self.video_frame_rate*sample.duration}"
 
         t, tl = self.manifest_processor.process_text_by_sample(sample=sample)
+
+        vf = torch.from_numpy(vf)
+        # convert the numpy features to a float32 torch tensor
+        vf = vf.float()
         if self.return_sample_id:
-            output = f, fl, torch.from_numpy(vf), torch.tensor(
+            output = f, fl, vf, torch.tensor(
                 t).long(), torch.tensor(tl).long(), index
         else:
-            output = f, fl, torch.from_numpy(vf), torch.tensor(
+            output = f, fl, vf, torch.tensor(
                 t).long(), torch.tensor(tl).long()
 
         return output
@@ -489,7 +491,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
         return {
             'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
-            'video_signal': NeuralType(('B', 'F', 'D'), ImageFeatureValue()),
+            'hidden_states': NeuralType(('B', 'T', 'D'), EncodedRepresentation()),
             'transcripts': NeuralType(('B', 'T'), LabelsType()),
             'transcript_length': NeuralType(tuple('B'), LengthsType()),
             'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
@@ -586,6 +588,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
         return {
             'audio_signal': NeuralType(('B', 'T'), AudioSignal()),
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
+            'video_input_signal': NeuralType(('B', 'T', 'D'), ChannelType()),
             'transcripts': NeuralType(('B', 'T'), LabelsType()),
             'transcript_length': NeuralType(tuple('B'), LengthsType()),
             'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py
index 23c759afc80d..538b3fcabcf6 100644
--- a/nemo/collections/asr/models/__init__.py
+++ b/nemo/collections/asr/models/__init__.py
@@ -23,6 +23,7 @@
 from nemo.collections.asr.models.clustering_diarizer import ClusteringDiarizer
 from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
 from nemo.collections.asr.models.ctc_models import EncDecCTCModel
+from nemo.collections.asr.models.av_ctc_models import AV_EncDecCTCModel
 from nemo.collections.asr.models.enhancement_models import (
     EncMaskDecAudioToAudioModel,
     PredictiveAudioToAudioModel,
diff --git a/nemo/collections/asr/models/av_ctc_models.py b/nemo/collections/asr/models/av_ctc_models.py
index 22f3a43f3b23..76d054ab0142 100644
--- a/nemo/collections/asr/models/av_ctc_models.py
+++ b/nemo/collections/asr/models/av_ctc_models.py
@@ -28,6 +28,7 @@
 from nemo.collections.asr.data.av_to_text import _AVTextDataset
 # from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs
 # from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset
+from nemo.collections.asr.models.ctc_models import EncDecCTCModel
 from nemo.collections.asr.losses.ctc import CTCLoss
 from nemo.collections.asr.metrics.wer import WER
 from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel
@@ -43,10 +44,10 @@
 from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType, ImageFeatureValue
 from nemo.utils import logging
 
-__all__ = ['EncDecCTCModel']
+__all__ = ['AV_EncDecCTCModel']
 
 
-class 
EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, InterCTCMixin, ASRTranscriptionMixin): +class AV_EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin, InterCTCMixin, ASRTranscriptionMixin): """Base class for encoder decoder CTC-based models.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): @@ -56,7 +57,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): if trainer is not None: self.world_size = trainer.world_size - super().__init__(cfg=cfg.a_model, trainer=trainer) + super().__init__(cfg=cfg, trainer=trainer) self.a_model = EncDecCTCModel.from_pretrained(cfg.a_model_name) with open_dict(self._cfg): @@ -76,18 +77,18 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) # initialize a transformer encoder and decoder - self.a_linear = torch.nn.Linear(in_features = self.a_model.decoder._feat_in, out_features = self.cfg.av_encoder.d_model) - self.v_linear = torch.nn.Linear(in_features = self.cfg.v_model.decoder._feat_in, out_features = self.cfg.av_encoder.d_model) + self.a_linear = torch.nn.Linear(in_features = self.a_model.encoder._feat_out, out_features = self.cfg.av_encoder.d_model) + self.v_linear = torch.nn.Linear(in_features = self.cfg.v_model.feat_dim, out_features = self.cfg.av_encoder.d_model) self.av_enocder_layer = torch.nn.TransformerEncoderLayer(d_model = self.cfg.av_encoder.d_model, nhead = self.cfg.av_encoder.nhead, dropout = self.cfg.av_encoder.dropout, batch_first=True) self.av_encoder = torch.nn.TransformerEncoder(self.av_enocder_layer, num_layers = self.cfg.av_encoder.num_layers) # Modality embeddings - self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_decoder.d_model) - self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_decoder.d_model) + self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model) + self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model) # Trainable positional encodings - self.a_pos_enc = torch.nn.Embedding(10000, self.cfg.av_decoder.d_model) - self.v_pos_enc = torch.nn.Embedding(10000, self.cfg.av_decoder.d_model) + # self.a_pos_enc = torch.nn.Embedding(10000, self.cfg.decoder.feat_in) + # self.v_pos_enc = torch.nn.Embedding(10000, self.cfg.decoder.feat_in) # self.av_decoder_layer = torch.nn.TransformerDecoderLayer(d_model = self.cfg.av_decoder.d_model, nhead = self.cfg.av_decoder.nhead, dropout = self.cfg.av_decoder.dropout, batch_first=True) # self.av_decoder = torch.nn.TransformerDecoder(self.av_decoder_layer, num_layers = self.cfg.av_decoder.num_layers) @@ -101,7 +102,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) # Setup decoding objects - # decoding_cfg = self.cfg.get('decoding', None) + decoding_cfg = self.cfg.get('decoding', None) # In case decoding config not found, use default config if decoding_cfg is None: @@ -427,14 +428,14 @@ def setup_test_data(self, test_data_config: Optional[Union[DictConfig, Dict]]): @property def input_types(self) -> Optional[Dict[str, NeuralType]]: - if hasattr(self.preprocessor, '_sample_rate'): - input_signal_eltype = AudioSignal(freq=self.preprocessor._sample_rate) + if hasattr(self.a_model.preprocessor, '_sample_rate'): + input_signal_eltype = AudioSignal(freq=self.a_model.preprocessor._sample_rate) else: input_signal_eltype = AudioSignal() return { "audio_input_signal": NeuralType(('B', 'T'), input_signal_eltype, optional=True), "audio_input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), - "video_input_signal": 
NeuralType(('B', 'F', 'D'), ImageFeatureValue(), optional=True), + "video_input_signal": NeuralType(('B', 'T', 'D'), ImageFeatureValue(), optional=True), "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True), "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True), "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True), @@ -450,7 +451,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: @typecheck() def forward( - self, audio_input=None, audio_input_signal_length=None, video_input_signal= None, processed_signal=None, processed_signal_length=None + self, audio_input_signal=None, audio_input_signal_length=None, video_input_signal= None, processed_signal=None, processed_signal_length=None ): """ Forward pass of the model. @@ -472,7 +473,7 @@ def forward( 2) The lengths of the acoustic sequence after propagation through the encoder, of shape [B]. 3) The greedy token predictions of the model of shape [B, T] (via argmax) """ - has_input_signal = audio_input is not None and audio_input_signal_length is not None + has_input_signal = audio_input_signal is not None and audio_input_signal_length is not None has_processed_signal = processed_signal is not None and processed_signal_length is not None if (has_input_signal ^ has_processed_signal) == False: raise ValueError( @@ -481,15 +482,16 @@ def forward( ) if not has_processed_signal: - processed_signal, processed_signal_length = self.preprocessor( - audio_input=audio_input, length=audio_input_signal_length, + processed_signal, processed_signal_length = self.a_model.preprocessor( + input_signal=audio_input_signal, length=audio_input_signal_length, ) - if self.spec_augmentation is not None and self.training: - processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) + if self.a_model.spec_augmentation is not None and self.training: + processed_signal = self.a_model.spec_augmentation(input_spec=processed_signal, length=processed_signal_length) - encoder_output = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - encoded = encoder_output[0] + encoder_output = self.a_model.encoder(audio_signal=processed_signal, length=processed_signal_length) + # B,C,T -> B,T,C + encoded = encoder_output[0].permute(0, 2, 1) encoded_len = encoder_output[1] a_encoded = self.a_linear(encoded) v_encoded = self.v_linear(video_input_signal) @@ -498,17 +500,24 @@ def forward( B, T, C = a_encoded.size() B, F, D = v_encoded.size() assert C == D, "The audio and video features must have the same dimensionality" - a_encoded = a_encoded + self.a_modal_embs.to(a_encoded.device).repeat(B, T, 1) - v_encoded = v_encoded + self.v_modal_embs.to(v_encoded.device).repeat(B, F, 1) + + # Expand modality embeddings to match the dimensions of a_encoded and v_encoded + a_modal_emb_expanded = self.a_modal_embs.weight.expand(B, T, -1) # Shape: (B, T, feat_in) + v_modal_emb_expanded = self.v_modal_embs.weight.expand(B, F, -1) # Shape: (B, F, feat_in) + + a_encoded = a_encoded + a_modal_emb_expanded + v_encoded = v_encoded + v_modal_emb_expanded # Add positional encodings - a_encoded = a_encoded + self.a_pos_enc.to(a_encoded.device).repeat(B, T, 1) - v_encoded = v_encoded + self.v_pos_enc.to(v_encoded.device).repeat(B, F, 1) + # a_encoded = a_encoded + self.a_pos_enc.to(a_encoded.device).repeat(B, T, 1) + # v_encoded = v_encoded + self.v_pos_enc.to(v_encoded.device).repeat(B, F, 1) # Concat and pass them through the transformer encoder av_encoded = 
         av_encoded = torch.cat((a_encoded, v_encoded), dim=1)
         av_encoded = self.av_encoder(av_encoded)
+        # B,T,C -> B,C,T
+        av_encoded = av_encoded.permute(0, 2, 1)
         log_probs = self.decoder(encoder_output=av_encoded)
         greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
@@ -533,7 +542,7 @@ def training_step(self, batch, batch_nb):
         #     processed_signal=signal, processed_signal_length=signal_len
         # )
         # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         if hasattr(self, '_trainer') and self._trainer is not None:
             log_every_n_steps = self._trainer.log_every_n_steps
@@ -583,7 +592,7 @@ def predict_step(self, batch, batch_idx, dataloader_idx=0):
         #     processed_signal=signal, processed_signal_length=signal_len
         # )
         # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         transcribed_texts, _ = self.wer.decoding.ctc_decoder_predictions_tensor(
             decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False,
@@ -602,7 +611,7 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0):
         #     processed_signal=signal, processed_signal_length=signal_len
         # )
         # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         loss_value = self.loss(
             log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
@@ -741,7 +750,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo
         dl_config = {
             'manifest_filepath': manifest_filepath,
-            'sample_rate': self.preprocessor._sample_rate,
+            'sample_rate': self.a_model.preprocessor._sample_rate,
             'labels': OmegaConf.to_container(self.decoder.vocabulary),
             'batch_size': batch_size,
             'trim_silence': False,
diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py
index 03a862a99ec5..f92a12cd6dde 100644
--- a/nemo/collections/common/parts/preprocessing/collections.py
+++ b/nemo/collections/common/parts/preprocessing/collections.py
@@ -471,7 +471,7 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
         )
 
 
-class ASR_AV_AudioText(AudioText):
+class ASR_AV_AudioText(AVText):
     """`AVText` collector of audio, video-feature and transcript entries from asr structured json files."""
 
     def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
@@ -496,7 +496,7 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs):
         for item in manifest.item_iter(manifests_files):
             ids.append(item['id'])
             audio_files.append(item['audio_file'])
-            video_featfiles.append(item['video_featfile'])
+            video_featfiles.append(item['feature_file'])
             durations.append(item['duration'])
             texts.append(item['text'])
             offsets.append(item['offset'])
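`ASR_AV_AudioText` iterates manifest entries and, alongside the usual audio fields, resolves the precomputed video features through the entry's `feature_file` key. A hypothetical manifest line matching the fields read above (key names assumed for illustration, not confirmed by the serializer):

    import json

    # Hypothetical AV manifest entry; 'feature_file' points at precomputed
    # per-frame video features, the remaining keys follow NeMo ASR manifests.
    entry = {
        "audio_filepath": "/data/clips/sample_0001.wav",
        "feature_file": "/data/video_feats/sample_0001.npy",
        "duration": 4.2,
        "offset": 0.0,
        "text": "hello world",
    }
    with open("av_manifest.json", "a") as f:
        f.write(json.dumps(entry) + "\n")

diff --git a/tutorials/asr/Streaming_ASR.ipynb b/tutorials/asr/Streaming_ASR.ipynb
index a4701dc025d8..0dac0f23dc31 100644
--- a/tutorials/asr/Streaming_ASR.ipynb
+++ 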
b/tutorials/asr/Streaming_ASR.ipynb @@ -1,48 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lJz6FDU1lRzc" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", - "\n\nNOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell.\n", - "\n", - "## Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg\n", - "!pip install text-unidecode\n", - "!pip install matplotlib>=3.3.2\n", - "\n", - "## Install NeMo\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", - "\n", - "## Grab the config we'll use in this example\n", - "!mkdir configs\n", - "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml\n", - "\n", - "\"\"\"\n", - "Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\n", - "Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\n", - "that you want to use the \"Run All Cells\" (or similar) option.\n", - "\"\"\"\n", - "# exit()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -83,33 +40,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# If something goes wrong during data processing, un-comment the following line to delete the cached dataset \n", "# !rm -rf datasets/mini-dev-clean\n", - "!mkdir -p datasets/mini-dev-clean" + "!mkdir -p /disk1/ksingla/datasets/mini-dev-clean" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 38/38 [00:03<00:00, 12.53it/s]\n" + ] + } + ], "source": [ "!python ../../scripts/dataset_processing/get_librispeech_data.py \\\n", - " --data_root \"datasets/mini-dev-clean/\" \\\n", + " --data_root \"/disk1/ksingla/datasets/mini-dev-clean/\" \\\n", " --data_sets dev_clean_2" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "manifest = \"datasets/mini-dev-clean/dev_clean_2.json\"" + "manifest = \"/disk1/ksingla/datasets/mini-dev-clean/dev_clean_2.json\"" ] }, { @@ -121,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -144,13 +109,51 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ffmpeg version 
4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers\n", + " built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)\n", + " configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared\n", + " libavutil 56. 70.100 / 56. 70.100\n", + " libavcodec 58.134.100 / 58.134.100\n", + " libavformat 58. 76.100 / 58. 76.100\n", + " libavdevice 58. 13.100 / 58. 13.100\n", + " libavfilter 7.110.100 / 7.110.100\n", + " libswscale 5. 9.100 / 5. 9.100\n", + " libswresample 3. 9.100 / 3. 9.100\n", + " libpostproc 55. 9.100 / 55. 9.100\n", + "\u001b[0;33mGuessed Channel Layout for Input Stream #0.0 : mono\n", + "\u001b[0mInput #0, concat, from 'concat_file.txt':\n", + " Duration: N/A, start: 0.000000, bitrate: 256 kb/s\n", + " Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output #0, wav, to '/disk1/ksingla/datasets/mini-dev-clean/concatenated_audio.wav':\n", + " Metadata:\n", + " ISFT : Lavf58.76.100\n", + " Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s\n", + "Stream mapping:\n", + " Stream #0:0 -> #0:0 (copy)\n", + "Press [q] to stop, [?] 
for help\n", + "size= 28251kB time=00:15:03.96 bitrate= 256.0kbits/s speed= 898x \n", + "video:0kB audio:28251kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000270%\n", + "Finished concatenating audio file!\n" + ] + } + ], "source": [ "new_duration, ref_transcript = concat_audio(manifest, 15*60)\n", "\n", - "concat_audio_path = \"datasets/mini-dev-clean/concatenated_audio.wav\"\n", + "concat_audio_path = \"/disk1/ksingla/datasets/mini-dev-clean/concatenated_audio.wav\"\n", "\n", "!ffmpeg -t {new_duration} -safe 0 -f concat -i concat_file.txt -c copy -t {new_duration} {concat_audio_path} -y\n", "print(\"Finished concatenating audio file!\")" @@ -166,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -178,11 +181,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cpu'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device = 'cpu'\n", "device" ] }, @@ -195,22 +209,95 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-05 23:24:06 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n", + "[NeMo I 2024-07-05 23:24:06 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo\n", + "[NeMo I 2024-07-05 23:24:06 common:924] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-07-05 23:24:07 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-05 23:24:07 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: true\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " shuffle_n: 2048\n", + " is_tarred: true\n", + " tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar\n", + " \n", + "[NeMo W 2024-07-05 23:24:07 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", + " Validation config : \n", + " manifest_filepath:\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " is_tarred: false\n", + " tarred_audio_filepaths: na\n", + " \n", + "[NeMo W 2024-07-05 23:24:07 modelPT:178] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath:\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", + " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 64\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " is_tarred: false\n", + " tarred_audio_filepaths: na\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-05 23:24:07 features:289] PADDING: 0\n", + "[NeMo I 2024-07-05 23:24:07 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n" + ] + } + ], "source": [ "# Clear up memory\n", - "torch.cuda.empty_cache()\n", - "gc.collect()\n", - "model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(\"stt_en_conformer_ctc_large\", map_location=device)\n", - "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", + "#torch.cuda.empty_cache()\n", + "#gc.collect()\n", + "model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(\"stt_en_conformer_ctc_small\", map_location=device)\n", + "device = 'cpu'\n", "# device = 'cpu' # You can transcribe even longer samples on the CPU, though it will take much longer !\n", "model = model.to(device)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -233,12 +320,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "EncDecCTCModel.transcribe() got an unexpected keyword argument 'device'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m autocast():\n\u001b[0;32m----> 2\u001b[0m transcript \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranscribe\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mconcat_audio_path\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcpu\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mTypeError\u001b[0m: EncDecCTCModel.transcribe() got an unexpected keyword argument 'device'" + ] + } + ], "source": [ "with autocast():\n", - " transcript = model.transcribe([concat_audio_path], batch_size=1)[0]" + " transcript = model.transcribe([concat_audio_path], batch_size=1, device=\"cpu\")[0]" ] }, { diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index ee5ae7392708..94b3897b6256 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "id": "pEYsuj0J9pId" }, @@ -22,8 +22,6 @@ "NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.\n", "\"\"\"\n", "# If you're using Google Colab and not running locally, run this cell.\n", - "import os\n", - "import sys\n", "\n", "# Install dependencies\n", "# !pip install wget\n", @@ -31,8 +29,7 @@ "# !pip install text-unidecode\n", "# !pip install matplotlib>=3.3.2\n", "\n", - "sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo/collections'))\n", - "sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource/nemo'))\n", + "\n", "## Install NeMo\n", "BRANCH = 'main'\n", "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", @@ -42,6 +39,58 @@ "# !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/nemo/NeMo-opensource/nemo/collections/asr/__init__.py\n", + "/workspace/nemo/NeMo-opensource/nemo/core/__init__.py\n", + "/workspace/nemo/NeMo-opensource/nemo/__init__.py\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "\n", + "# Insert local paths at the beginning of sys.path\n", + "sys.path.insert(0, os.path.abspath('/workspace/nemo/NeMo-opensource/'))\n", + "\n", + "import nemo.collections.asr as nemo_asr\n", + "print(nemo_asr.__file__)\n", + 
"import nemo.core as nemo_core\n", + "print(nemo_core.__file__)\n", + "from nemo.core import adapter_mixins\n", + "import nemo\n", + "print(nemo.__file__)\n", + "# Restore the site-packages paths\n", + "# sys.path.extend(site_packages_paths)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/__init__.py\n" + ] + } + ], + "source": [ + "import lightning\n", + "print(lightning.__file__)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -143,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "id": "NpKgT6q5-gNk" }, @@ -160,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "id": "0wZZuUDi_gEV" }, @@ -235,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "id": "9fiqQeWDAXsH" }, @@ -263,39 +312,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "id": "F-wt9y5iAali" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[NeMo W 2024-07-04 11:54:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:626: UserWarning: Can't initialize NVML\n", - " warnings.warn(\"Can't initialize NVML\")\n", - " \n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from omegaconf import OmegaConf, open_dict\n", "from pytorch_lightning import Trainer\n", - "\n", - "# import nemo.collections.asr as nemo_asr\n", - "import asr as nemo_asr" + "from lightning.pytorch.loggers import WandbLogger\n", + "wandb_logger = WandbLogger(project=\"NEMO_TEST\")\n", + "# import nemo.collections.asr as nemo_asr" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "id": "uVOfU7gsCI5u" }, "outputs": [], "source": [ - "model_name = \"stt_en_conformer_ctc_small\"" + "model_name = \"stt_en_conformer_ctc_large\"" ] }, { @@ -326,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "id": "RzwLAHVqAqD9" }, @@ -335,9 +374,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 11:54:53 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n", - "[NeMo I 2024-07-04 11:54:53 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo\n", - "[NeMo I 2024-07-04 11:54:53 common:924] Instantiating model from pre-trained checkpoint\n" + "[NeMo I 2024-07-08 16:47:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-07-08 16:47:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-07-08 16:47:54 common:815] Instantiating model from pre-trained checkpoint\n" ] } ], @@ -347,13 +386,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "id": "O6xAz38-A_Bh" }, "outputs": [], "source": [ - "from nemo.core import adapter_mixins\n", "\n", "# Utility method to check and update the model config\n", "def 
update_model_config_to_support_adapter(model_cfg):\n", @@ -377,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "id": "iyp4xUOLBi0v" }, @@ -405,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "id": "7r36mkUGBvsy" }, @@ -414,63 +452,85 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 11:54:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n", - "[NeMo I 2024-07-04 11:54:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo\n", - "[NeMo I 2024-07-04 11:54:54 common:924] Instantiating model from pre-trained checkpoint\n", - "[NeMo I 2024-07-04 11:54:55 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n" + "[NeMo I 2024-07-08 16:47:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-07-08 16:47:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-07-08 16:47:54 common:815] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-07-08 16:47:55 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[NeMo W 2024-07-04 11:54:56 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + "[NeMo W 2024-07-08 16:47:55 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", " Train config : \n", - " manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json\n", + " manifest_filepath:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json\n", " sample_rate: 16000\n", - " batch_size: 64\n", + " batch_size: 1\n", " shuffle: true\n", - " num_workers: 8\n", + " num_workers: 4\n", " pin_memory: true\n", " use_start_end_token: false\n", " trim_silence: false\n", " max_duration: 20.0\n", " min_duration: 0.1\n", - " shuffle_n: 2048\n", " is_tarred: true\n", - " tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar\n", + " tarred_audio_filepaths:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar\n", + " - - 
/data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar\n", + " shuffle_n: 2048\n", + " bucketing_strategy: synced_randomized\n", + " bucketing_batch_size:\n", + " - 34\n", + " - 30\n", + " - 26\n", + " - 22\n", + " - 18\n", + " - 16\n", + " - 12\n", + " - 8\n", " \n", - "[NeMo W 2024-07-04 11:54:56 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + "[NeMo W 2024-07-08 16:47:55 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", " Validation config : \n", " manifest_filepath:\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", " sample_rate: 16000\n", - " batch_size: 64\n", + " batch_size: 32\n", " shuffle: false\n", " num_workers: 8\n", " pin_memory: true\n", " use_start_end_token: false\n", - " is_tarred: false\n", - " tarred_audio_filepaths: na\n", " \n", - "[NeMo W 2024-07-04 11:54:56 modelPT:178] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + "[NeMo W 2024-07-08 16:47:55 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", " Test config : \n", " manifest_filepath:\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-other.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-clean.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-dev-other.json\n", - " - /data/ASR/LibriSpeech/librispeech_withsp2/manifests/librivox-test-clean.json\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", " sample_rate: 16000\n", - " batch_size: 64\n", + " batch_size: 32\n", " shuffle: false\n", " num_workers: 8\n", " pin_memory: true\n", " use_start_end_token: false\n", - " is_tarred: false\n", - " tarred_audio_filepaths: na\n", " \n" ] }, @@ -478,22 +538,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 11:54:56 features:289] PADDING: 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: Logging before flag parsing goes to stderr.\n", - "E0704 11:54:56.411043 
124023995053888 driver.py:396] Call to cuInit results in CUDA_ERROR_NO_DEVICE\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2024-07-04 11:54:56 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_small/5d2d8e5b2b5adb8f5091363c6ba19c55/stt_en_conformer_ctc_small.nemo.\n" + "[NeMo I 2024-07-08 16:47:55 features:305] PADDING: 0\n", + "[NeMo I 2024-07-08 16:47:57 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n" ] } ], @@ -520,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "id": "sWRUXzjQMWN5" }, @@ -529,9 +575,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n" + "INFO: GPU available: True (cuda), used: True\n", + "WARNING: Logging before flag parsing goes to stderr.\n", + "I0708 16:47:57.231223 129166743029568 rank_zero.py:64] GPU available: True (cuda), used: True\n", + "INFO: TPU available: False, using: 0 TPU cores\n", + "I0708 16:47:57.256626 129166743029568 rank_zero.py:64] TPU available: False, using: 0 TPU cores\n", + "INFO: HPU available: False, using: 0 HPUs\n", + "I0708 16:47:57.257805 129166743029568 rank_zero.py:64] HPU available: False, using: 0 HPUs\n" ] } ], @@ -548,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "id": "tJBriqr3tQV7" }, @@ -585,28 +635,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "id": "WgogR3taD7NA" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Base charset : [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n" - ] - } - ], + "outputs": [], "source": [ - "train_data = read_manifest(TRAIN_MANIFEST)\n", - "base_sets = [set(list(sample['text'])) for sample in train_data]\n", - "base_charset = set([])\n", - "for charset in base_sets:\n", - " base_charset.update(charset)\n", - "base_charset = list(sorted(list(base_charset)))\n", + "# train_data = read_manifest(TRAIN_MANIFEST)\n", + "# base_sets = [set(list(sample['text'])) for sample in train_data]\n", + "# base_charset = set([])\n", + "# for charset in base_sets:\n", + "# base_charset.update(charset)\n", + "# base_charset = list(sorted(list(base_charset)))\n", "\n", - "print(\"Base charset :\", base_charset)" + "# print(\"Base charset :\", base_charset)" ] }, { @@ -620,27 +662,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 34, "metadata": { "id": "5laUkRf5Eb6l" }, "outputs": [], "source": [ - "def check_valid_charset_in_vocab(model, charset):\n", - " model_vocab = model.decoder.vocabulary\n", - " num_invalid = 0\n", + "# def check_valid_charset_in_vocab(model, charset):\n", + "# model_vocab = model.decoder.vocabulary\n", + "# num_invalid = 0\n", "\n", - " for char in charset:\n", - " if char != ' ' and char not in model_vocab:\n", - " print(f\"Character `{char}` does not exist in the base character set of the original model !\")\n", - " num_invalid += 1\n", + "# for char in charset:\n", + "# if char != ' ' and char not in model_vocab:\n", + "# print(f\"Character `{char}` does not 
exist in the base character set of the original model !\")\n", + "# num_invalid += 1\n", "\n", - " print(\"Number of invalid tokens :\", num_invalid)" + "# print(\"Number of invalid tokens :\", num_invalid)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "id": "5rEUqs7AFh5j" }, @@ -654,7 +696,7 @@ } ], "source": [ - "check_valid_charset_in_vocab(model, base_charset)" + "# check_valid_charset_in_vocab(model, base_charset)" ] }, { @@ -670,40 +712,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": { "id": "Ak4v4aWjGoQH" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-07-04 11:20:18-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/transcribe_speech.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 23427 (23K) [text/plain]\n", - "Saving to: ‘scripts/transcribe_speech.py’\n", - "\n", - "transcribe_speech.p 100%[===================>] 22.88K --.-KB/s in 0s \n", - "\n", - "2024-07-04 11:20:18 (113 MB/s) - ‘scripts/transcribe_speech.py’ saved [23427/23427]\n", - "\n", - "--2024-07-04 11:20:19-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/speech_to_text_eval.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 8539 (8.3K) [text/plain]\n", - "Saving to: ‘scripts/speech_to_text_eval.py’\n", - "\n", - "speech_to_text_eval 100%[===================>] 8.34K --.-KB/s in 0s \n", - "\n", - "2024-07-04 11:20:19 (82.1 MB/s) - ‘scripts/speech_to_text_eval.py’ saved [8539/8539]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if not os.path.exists('scripts/transcribe_speech.py'):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/transcribe_speech.py\n", @@ -714,14 +727,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "id": "OVlBKWCiIHw7" }, "outputs": [], "source": [ - "# temporarily save current model\n", - "model.save_to(\"/content/unadapted_model.nemo\")" + "# # temporarily save current model\n", + "# model.save_to(\"/content/unadapted_model.nemo\")" ] }, { @@ -737,18 +750,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "C6YbPt70H0-N" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/workspace/nemo/NeMo-opensource/tutorials/asr/asr_adapters/scripts/speech_to_text_eval.py\", line 71, in \n", + " import transcribe_speech\n", + " File \"/workspace/nemo/NeMo-opensource/tutorials/asr/asr_adapters/scripts/transcribe_speech.py\", line 29, in \n", + " from nemo.collections.asr.models.aed_multitask_models import parse_multitask_prompt\n", + "ImportError: cannot import name 'parse_multitask_prompt' from 'nemo.collections.asr.models.aed_multitask_models' (/usr/local/lib/python3.10/dist-packages/nemo/collections/asr/models/aed_multitask_models.py)\n" + ] + } + ], "source": [ - "!python scripts/speech_to_text_eval.py \\\n", - " model_path=\"/content/unadapted_model.nemo\" \\\n", - " dataset_manifest=$TEST_MANIFEST \\\n", - " output_filename=\"/content/unadapted_predictions.json\" \\\n", - " batch_size=32 \\\n", - " use_cer=False" + "# !python scripts/speech_to_text_eval.py \\\n", + "# model_path=\"/content/unadapted_model.nemo\" \\\n", + "# dataset_manifest=$TEST_MANIFEST \\\n", + "# output_filename=\"/content/unadapted_predictions.json\" \\\n", + "# batch_size=32 \\\n", + "# use_cer=False" ] }, { @@ -764,13 +790,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "SE8uoRLsJA9F" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "head: cannot open '/content/unadapted_predictions.json' for reading: No such file or directory\n" + ] + } + ], "source": [ - "!head -n 5 /content/unadapted_predictions.json" + "# !head -n 5 /content/unadapted_predictions.json" ] }, { @@ -814,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": { "id": "F0GIxhyCJmFv" }, @@ -823,13 +857,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Value(False)\n", - "[NeMo I 2024-07-04 12:07:47 collections:196] Dataset loaded with 948 files totalling 0.71 hours\n", - "[NeMo I 2024-07-04 12:07:47 collections:197] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-04 16:53:51 collections:196] Dataset loaded with 130 files totalling 0.10 hours\n", - "[NeMo I 2024-07-04 16:53:51 collections:197] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-04 16:53:52 collections:196] Dataset loaded with 130 files totalling 0.10 hours\n", - "[NeMo I 2024-07-04 16:53:52 collections:197] 0 files were filtered 
totalling 0.00 hours\n" + "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 948 files totalling 0.71 hours\n", + "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n" ] } ], @@ -864,7 +897,42 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sample_rate : 16000\n", + "log_prediction : True\n", + "ctc_reduction : mean_batch\n", + "skip_nan_grad : False\n", + "train_ds : {'manifest_filepath': 'datasets/an4/train_manifest.json', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': True, 'num_workers': 4, 'pin_memory': True, 'use_start_end_token': False, 'trim_silence': False, 'max_duration': 20.0, 'min_duration': 0.1, 'is_tarred': False, 'tarred_audio_filepaths': None, 'shuffle_n': 2048, 'bucketing_strategy': 'synced_randomized', 'bucketing_batch_size': [34, 30, 26, 22, 18, 16, 12, 8]}\n", + "validation_ds : {'manifest_filepath': 'datasets/an4/test_manifest.json', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': False, 'num_workers': 8, 'pin_memory': True, 'use_start_end_token': False}\n", + "test_ds : {'manifest_filepath': 'datasets/an4/test_manifest.json', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': False, 'num_workers': 8, 'pin_memory': True, 'use_start_end_token': False}\n", + "tokenizer : {'dir': '/tokenizers/NeMo_ASR_SET/English/asr_set_3.0/tokenizer_spe_unigram_v128', 'type': 'bpe', 'model_path': 'nemo:e06949b0b85a485e9f280ea6d19e5492_tokenizer.model', 'vocab_path': 'nemo:53bbc634b62446de83525753e95a50ac_vocab.txt', 'spe_tokenizer_vocab': 'nemo:ff63e3c43c5f4b95bff702425366a4a6_tokenizer.vocab'}\n", + "preprocessor : {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'sample_rate': 16000, 'normalize': 'per_feature', 'window_size': 0.025, 'window_stride': 0.01, 'window': 'hann', 'features': 80, 'n_fft': 512, 'log': True, 'frame_splicing': 1, 'dither': 1e-05, 'pad_to': 0, 'pad_value': 0.0}\n", + "spec_augment : {'_target_': 'nemo.collections.asr.modules.SpectrogramAugmentation', 'freq_masks': 2, 'time_masks': 10, 'freq_width': 27, 'time_width': 0.05}\n", + "encoder : {'_target_': 'nemo.collections.asr.modules.conformer_encoder.ConformerEncoderAdapter', 'feat_in': 80, 'feat_out': -1, 'n_layers': 18, 'd_model': 512, 'subsampling': 'striding', 'subsampling_factor': 4, 'subsampling_conv_channels': 512, 'ff_expansion_factor': 4, 'self_attention_model': 'rel_pos', 'n_heads': 8, 'att_context_size': [-1, -1], 'xscaling': True, 'untie_biases': True, 'pos_emb_max_len': 5000, 'conv_kernel_size': 31, 'conv_norm_type': 'batch_norm', 'dropout': 0.1, 'dropout_emb': 0.0, 'dropout_att': 0.1}\n", + "decoder : {'_target_': 'nemo.collections.asr.modules.ConvASRDecoder', 'feat_in': 512, 'num_classes': 128, 'vocabulary': ['', '▁', 's', 't', 'e', 'd', 'o', '▁the', 'a', 'i', '▁a', 'u', 'y', 'm', 'l', 'n', 'p', 're', 'c', 'h', 'r', '▁s', 'g', '▁to', 'er', 'ing', 'f', '▁and', 'an', '▁i', 'k', '▁that', \"'\", '▁of', '▁in', 'w', '▁p', 'ed', 'or', 'al', 'ar', '▁f', 'en', 'in', 'b', '▁you', '▁w', '▁b', 'le', 'll', 
'es', '▁it', 've', 'ur', '▁we', '▁re', '▁be', 'ly', '▁is', '▁he', '▁o', '▁c', 'it', '▁n', '▁on', 'un', '▁t', 'on', 'se', 'th', 'ce', '▁do', 'ic', '▁for', '▁th', 'ion', 'ch', '▁was', 'ri', 'ent', '▁g', 'ver', '▁co', 'li', '▁ha', '▁ma', 'la', 'ro', 'v', 'us', '▁ca', '▁di', '▁this', 'ra', '▁st', '▁e', '▁not', '▁so', '▁de', '▁have', 'ter', 'ir', '▁go', 'ation', '▁with', 'ate', '▁me', '▁mo', 'ment', '▁con', '▁but', 'vi', '▁pro', '▁ho', 'j', '▁com', 'ight', '▁know', '▁what', 'ect', '▁ex', '▁some', '▁would', '▁like', 'x', '▁his', 'q', 'z']}\n", + "optim : {'name': 'adamw', 'lr': 2.0, 'betas': [0.9, 0.98], 'weight_decay': 0.001, 'sched': {'name': 'NoamAnnealing', 'd_model': 512, 'warmup_steps': 10000, 'warmup_ratio': None, 'min_lr': 1e-06}}\n", + "compute_eval_loss : False\n", + "variational_noise : {'start_step': 0, 'std': 0.0}\n", + "target : nemo.collections.asr.models.ctc_bpe_models.EncDecCTCModelBPE\n", + "nemo_version : 1.9.0rc0\n", + "decoding : {'strategy': 'greedy', 'preserve_alignments': None, 'compute_timestamps': None, 'word_seperator': ' ', 'ctc_timestamp_type': 'all', 'batch_dim_index': 0, 'greedy': {'preserve_alignments': False, 'compute_timestamps': False, 'preserve_frame_confidence': False, 'confidence_method_cfg': {'name': 'entropy', 'entropy_type': 'tsallis', 'alpha': 0.33, 'entropy_norm': 'exp', 'temperature': 'DEPRECATED'}}, 'beam': {'beam_size': 4, 'search_type': 'default', 'preserve_alignments': False, 'compute_timestamps': False, 'return_best_hypothesis': True, 'beam_alpha': 1.0, 'beam_beta': 0.0, 'kenlm_path': None, 'flashlight_cfg': {'lexicon_path': None, 'boost_path': None, 'beam_size_token': 16, 'beam_threshold': 20.0, 'unk_weight': -inf, 'sil_weight': 0.0}, 'pyctcdecode_cfg': {'beam_prune_logp': -10.0, 'token_min_logp': -5.0, 'prune_history': False, 'hotwords': None, 'hotword_weight': 10.0}}, 'confidence_cfg': {'preserve_frame_confidence': False, 'preserve_token_confidence': False, 'preserve_word_confidence': False, 'exclude_blank': True, 'aggregation': 'min', 'tdt_include_duration': False, 'method_cfg': {'name': 'entropy', 'entropy_type': 'tsallis', 'alpha': 0.33, 'entropy_norm': 'exp', 'temperature': 'DEPRECATED'}}, 'temperature': 1.0}\n" + ] + } + ], + "source": [ + "for key in model.cfg.keys():\n", + " print(f\"{key} : {model.cfg[key]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": { "id": "T-XFuaA3OlOB" }, @@ -899,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 18, "metadata": { "id": "UDEIfMTcP6j6" }, @@ -931,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "metadata": { "id": "tp_8FGPcKjMd" }, @@ -940,7 +1008,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 19:47:39 modelPT:723] Optimizer config = AdamW (\n", + "[NeMo I 2024-07-08 10:01:14 modelPT:767] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: [0.9, 0.98]\n", @@ -953,7 +1021,7 @@ " maximize: False\n", " weight_decay: 0.0\n", " )\n", - "[NeMo I 2024-07-04 19:47:39 lr_scheduler:915] Scheduler \"\" \n", + "[NeMo I 2024-07-08 10:01:14 lr_scheduler:923] Scheduler \"\" \n", " will be used during training (effective maximum steps = 300) - \n", " Parameters : \n", " (d_model: 176\n", @@ -1001,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 43, "metadata": { "id": "fRIDhU8RVBwi" }, @@ -1031,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 42, "metadata": { "id": 
"iNnSp_azQ2u8" }, @@ -1040,10 +1108,11 @@ "name": "stdout", "output_type": "stream", "text": [ + "\n", "Module : ConformerEncoderAdapter\n", - "\n", "\n", "\n", + "\n", "\n", "Module : ConvASRDecoder\n", "\n", @@ -1052,6 +1121,7 @@ } ], "source": [ + "print(model.children())\n", "for module in model.children():\n", " if hasattr(module, 'get_accepted_adapter_types'):\n", " types = module.get_accepted_adapter_types()\n", @@ -1092,7 +1162,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "metadata": { "id": "oZZr6vSntuyX" }, @@ -1103,7 +1173,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "metadata": { "id": "dlj0Yud4MxOi" }, @@ -1119,7 +1189,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "metadata": { "id": "Uv8WRQkXU3mu" }, @@ -1155,7 +1225,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": { "id": "-MbSTbYiYtnB" }, @@ -1178,7 +1248,7 @@ "52.616 Total estimated model params size (MB)" ] }, - "execution_count": 28, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1202,7 +1272,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": { "id": "El6ewd1GX9V7" }, @@ -1224,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "metadata": { "id": "rIvw0_8iYpHW" }, @@ -1247,7 +1317,7 @@ "53.360 Total estimated model params size (MB)" ] }, - "execution_count": 30, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1273,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": { "id": "ogUfDkjdZKHu" }, @@ -1282,8 +1352,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 19:48:18 adapter_mixins:719] Setting adapter 'AN4' status : Enabled = False\n", - "[NeMo I 2024-07-04 19:48:18 adapter_mixins:734] Setting adapter 'AN4' status : Enabled = True\n" + "[NeMo I 2024-07-08 10:05:36 adapter_mixins:719] Setting adapter 'AN4' status : Enabled = False\n", + "[NeMo I 2024-07-08 10:05:36 adapter_mixins:734] Setting adapter 'AN4' status : Enabled = True\n" ] } ], @@ -1309,7 +1379,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 29, "metadata": { "id": "RN2YayAoYzaI" }, @@ -1318,23 +1388,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.2.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 
21:32:14 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.8.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-04 21:32:14 adapter_mixins:435] Unfrozen adapter : AN4\n" + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.2.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.8.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-08 10:05:38 adapter_mixins:435] Unfrozen adapter : AN4\n" ] } ], @@ -1363,7 +1433,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 30, "metadata": { "id": "Lf3pdwQ2Zch5" }, @@ -1386,7 +1456,7 @@ "53.360 Total estimated model params size (MB)" ] }, - "execution_count": 45, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1408,7 +1478,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 31, "metadata": { "id": "w9ciIw-2bSHq" }, @@ -1417,22 +1487,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 21:32:25 exp_manager:396] Experiments will be logged at experiments/ASR-Adapters/2024-07-04_21-32-25\n", - "[NeMo I 2024-07-04 21:32:25 exp_manager:842] TensorboardLogger has been set up\n" + "[NeMo I 2024-07-08 10:06:22 exp_manager:396] Experiments will be logged at experiments/ASR-Adapters/2024-07-08_10-06-22\n", + "[NeMo I 2024-07-08 10:06:22 exp_manager:856] TensorboardLogger has been set up\n", + "[NeMo I 2024-07-08 10:06:22 exp_manager:871] WandBLogger has been set up\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[NeMo W 2024-07-04 21:32:25 exp_manager:952] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 300. Please ensure that max_steps will run for at least 3 epochs to ensure that checkpointing will not error out.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2024-07-04 21:32:25 exp_manager:971] Preemption is supported only on GPUs, disabling preemption\n" + "[NeMo W 2024-07-08 10:06:22 exp_manager:966] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 300. 
Please ensure that max_steps will run for at least 3 epochs to ensure that checkpointing will not error out.\n" ] } ], @@ -1440,6 +1504,7 @@ "# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n", "from nemo.utils import exp_manager\n", "\n", + "\n", "# Environment variable generally used for multi-node multi-gpu training.\n", "# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.\n", "os.environ.pop('NEMO_EXPM_VERSION', None)\n", @@ -1453,6 +1518,8 @@ " always_save_nemo=True,\n", " save_best_model=True,\n", " ),\n", + " create_wandb_logger=True,\n", + " wandb_logger_kwargs=OmegaConf.create({\"project\": \"NEMO_TEST\", \"name\": \"ASR-Adapters\", \"log_model\":\"all\"}),\n", ")\n", "\n", "exp_config = OmegaConf.structured(exp_config)\n", @@ -1462,16 +1529,23 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 33, "metadata": { "id": "cY2TJod3ZfyE" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 21:32:36 modelPT:723] Optimizer config = AdamW (\n", + "[NeMo I 2024-07-07 13:27:08 modelPT:767] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: [0.9, 0.98]\n", @@ -1484,7 +1558,7 @@ " maximize: False\n", " weight_decay: 0.0\n", " )\n", - "[NeMo I 2024-07-04 21:32:36 lr_scheduler:915] Scheduler \"\" \n", + "[NeMo I 2024-07-07 13:27:08 lr_scheduler:923] Scheduler \"\" \n", " will be used during training (effective maximum steps = 300) - \n", " Parameters : \n", " (d_model: 176\n", @@ -1518,7 +1592,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0d39171be96b430d8599e806ecaedd7a", + "model_id": "b9415ef40e0b4ef581d18cb559be9768", "version_major": 2, "version_minor": 0 }, @@ -1533,20 +1607,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-04 21:32:38 wer:318] \n", + "[NeMo I 2024-07-07 13:27:08 wer:334] \n", " \n", - "[NeMo I 2024-07-04 21:32:38 wer:319] reference:rubout g m e f three nine\n", - "[NeMo I 2024-07-04 21:32:38 wer:320] predicted:rabutt g m e f three nine\n", - "[NeMo I 2024-07-04 21:32:39 wer:318] \n", + "[NeMo I 2024-07-07 13:27:08 wer:335] reference:rubout g m e f three nine\n", + "[NeMo I 2024-07-07 13:27:08 wer:336] predicted:rabutt g m e f three nine\n", + "[NeMo I 2024-07-07 13:27:08 wer:334] \n", " \n", - "[NeMo I 2024-07-04 21:32:39 wer:319] reference:v a n e s s a\n", - "[NeMo I 2024-07-04 21:32:39 wer:320] predicted:v a n e s s a\n" + "[NeMo I 2024-07-07 13:27:08 wer:335] reference:v a n e s s a\n", + "[NeMo I 2024-07-07 13:27:08 wer:336] predicted:v a n e s s a\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-07 13:27:08 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. 
Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.\n", + " \n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "04f304daeae64bd5a67b5443f907c14f", + "model_id": "535776581b0e4d46aa73cc80ce5a8ddd", "version_major": 2, "version_minor": 0 }, @@ -1557,66 +1639,428 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-07 13:27:08 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption\n", + "[NeMo I 2024-07-07 13:27:09 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:09 wer:335] reference:w h i t n e y\n", + "[NeMo I 2024-07-07 13:27:09 wer:336] predicted:w h i t n e y\n", + "[NeMo I 2024-07-07 13:27:10 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:10 wer:335] reference:one five oh one five\n", + "[NeMo I 2024-07-07 13:27:10 wer:336] predicted:one five oh one five\n", + "[NeMo I 2024-07-07 13:27:10 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:10 wer:335] reference:one five two one three\n", + "[NeMo I 2024-07-07 13:27:10 wer:336] predicted:one five two one three\n", + "[NeMo I 2024-07-07 13:27:11 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:11 wer:335] reference:l i n d a\n", + "[NeMo I 2024-07-07 13:27:11 wer:336] predicted:l i n d a\n", + "[NeMo I 2024-07-07 13:27:12 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:12 wer:335] reference:i t h a c a\n", + "[NeMo I 2024-07-07 13:27:12 wer:336] predicted:i t h a c a\n", + "[NeMo I 2024-07-07 13:27:12 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:12 wer:335] reference:one five two two one\n", + "[NeMo I 2024-07-07 13:27:12 wer:336] predicted:one five two two one\n", + "[NeMo I 2024-07-07 13:27:13 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:13 wer:335] reference:p t q d q one\n", + "[NeMo I 2024-07-07 13:27:13 wer:336] predicted:p t q d q one\n", + "[NeMo I 2024-07-07 13:27:13 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:13 wer:335] reference:b a i q w eight\n", + "[NeMo I 2024-07-07 13:27:13 wer:336] predicted:b a i q w eight\n", + "[NeMo I 2024-07-07 13:27:14 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:14 wer:335] reference:nine three five oh six nine two\n", + "[NeMo I 2024-07-07 13:27:14 wer:336] predicted:nine three five oh six nine two\n", + "[NeMo I 2024-07-07 13:27:15 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:15 wer:335] reference:u s i q n seventy one\n", + "[NeMo I 2024-07-07 13:27:15 wer:336] predicted:u s i q n seventy one\n", + "[NeMo I 2024-07-07 13:27:15 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:15 wer:335] reference:six zero one five\n", + "[NeMo I 2024-07-07 13:27:15 wer:336] predicted:six zero one five\n", + "[NeMo I 2024-07-07 13:27:16 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:16 wer:335] reference:o a k d a l e d r i v e\n", + "[NeMo I 2024-07-07 13:27:16 wer:336] predicted:o a k d a l e d r i v e\n", + "[NeMo I 2024-07-07 13:27:16 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:16 wer:335] reference:two two nine three\n", + "[NeMo I 2024-07-07 13:27:16 wer:336] predicted:two two nine three\n", + "[NeMo I 2024-07-07 13:27:17 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:17 wer:335] reference:b a r r y\n", + "[NeMo I 2024-07-07 13:27:17 wer:336] predicted:b a r r y\n", + "[NeMo I 2024-07-07 13:27:17 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:17 wer:335] reference:repeat\n", + "[NeMo I 2024-07-07 13:27:17 wer:336] 
predicted:repeat\n", + "[NeMo I 2024-07-07 13:27:18 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:18 wer:335] reference:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:18 wer:336] predicted:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:19 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:19 wer:335] reference:one five two four one\n", + "[NeMo I 2024-07-07 13:27:19 wer:336] predicted:one five two four one\n", + "[NeMo I 2024-07-07 13:27:19 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:19 wer:335] reference:rubout f y n p ninety seven\n", + "[NeMo I 2024-07-07 13:27:19 wer:336] predicted:rubout f y n p ninety seven\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5738aa22127a4221bf4c431cdd5479f7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | | 0/? [00:00 2\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py:543\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 541\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 543\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
**kwargs)\n",
-    "(remaining traceback frames condensed from the removed failed-run output; ANSI color escapes stripped: Trainer._fit_impl -> Trainer._run -> Trainer._run_stage -> _FitLoop.run/advance -> _TrainingEpochLoop.run/advance -> _AutomaticOptimization.run -> _optimizer_step -> LightningModule.optimizer_step -> LightningOptimizer.step -> Strategy.optimizer_step -> Precision.optimizer_step -> AdamW.step(closure) -> Precision._wrap_closure -> Closure.closure -> _AutomaticOptimization._training_step -> Strategy.training_step -> nemo wrap_training_step -> EncDecCTCModel.training_step -> EncDecCTCModel.forward -> ConformerEncoder.forward -> ConformerEncoder.forward_internal -> ConvSubsampling.forward -> Linear.forward -> torch.utils.data._utils.signal_handling handler -> _error_if_any_worker_fails)\n",
-    "RuntimeError: DataLoader worker (pid 2281096) is killed by signal: Bus error. 
It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit." + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-07 13:27:27 nemo_model_checkpoint:217] New best .nemo model saved to: /workspace/nemo/NeMo-opensource/tutorials/asr/asr_adapters/experiments/ASR-Adapters/2024-07-07_13-24-36/checkpoints/ASR-Adapters.nemo\n", + "[NeMo I 2024-07-07 13:27:28 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:28 wer:335] reference:no\n", + "[NeMo I 2024-07-07 13:27:28 wer:336] predicted:no\n", + "[NeMo I 2024-07-07 13:27:28 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:28 wer:335] reference:rubout c y r b seven eight\n", + "[NeMo I 2024-07-07 13:27:28 wer:336] predicted:rubout c y r b seven eight\n", + "[NeMo I 2024-07-07 13:27:29 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:29 wer:335] reference:a m s j seven thousand one hundred and eighty six\n", + "[NeMo I 2024-07-07 13:27:29 wer:336] predicted:a m s j seven thousand one hundred eighty six\n", + "[NeMo I 2024-07-07 13:27:29 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:29 wer:335] reference:one four eight five oh\n", + "[NeMo I 2024-07-07 13:27:29 wer:336] predicted:one four eight five oh\n", + "[NeMo I 2024-07-07 13:27:30 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:30 wer:335] reference:l i n d a\n", + "[NeMo I 2024-07-07 13:27:30 wer:336] predicted:l i n d a\n", + "[NeMo I 2024-07-07 13:27:30 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:30 wer:335] reference:eight fourteen sixty four\n", + "[NeMo I 2024-07-07 13:27:30 wer:336] predicted:eight fourteen sixty four\n", + "[NeMo I 2024-07-07 13:27:31 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:31 wer:335] reference:one five two one seven\n", + "[NeMo I 2024-07-07 13:27:31 wer:336] predicted:one five two one seven\n", + "[NeMo I 2024-07-07 13:27:32 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:32 wer:335] reference:s t e v e n\n", + "[NeMo I 2024-07-07 13:27:32 wer:336] predicted:s t e v e n\n", + "[NeMo I 2024-07-07 13:27:32 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:32 wer:335] reference:two four two two nine one two\n", + "[NeMo I 2024-07-07 13:27:32 wer:336] predicted:two four two two nine one two\n", + "[NeMo I 2024-07-07 13:27:33 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:33 wer:335] reference:m e m o r y l a n e\n", + "[NeMo I 2024-07-07 13:27:33 wer:336] predicted:m e m o r y l a n e\n", + "[NeMo I 2024-07-07 13:27:33 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:33 wer:335] reference:stop\n", + "[NeMo I 2024-07-07 13:27:33 wer:336] predicted:stop\n", + "[NeMo I 2024-07-07 13:27:34 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:34 wer:335] reference:t h a y e r\n", + "[NeMo I 2024-07-07 13:27:34 wer:336] predicted:t h a y e r\n", + "[NeMo I 2024-07-07 13:27:34 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:34 wer:335] reference:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:34 wer:336] predicted:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:35 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:35 wer:335] reference:oh six oh seven six six\n", + "[NeMo I 2024-07-07 13:27:35 wer:336] predicted:oh six oh seven six six\n", + "[NeMo I 2024-07-07 13:27:35 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:35 wer:335] reference:enter four thousand eight hundred eighty three\n", + "[NeMo I 2024-07-07 13:27:35 wer:336] predicted:enter four thousand eight hundred eighty three\n", + "[NeMo I 2024-07-07 13:27:36 wer:334] \n", + " \n", + "[NeMo I 
2024-07-07 13:27:36 wer:335] reference:m o r e w o o d\n", + "[NeMo I 2024-07-07 13:27:36 wer:336] predicted:m o r e w o o d\n", + "[NeMo I 2024-07-07 13:27:37 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:37 wer:335] reference:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:37 wer:336] predicted:p i t t s b u r g h\n", + "[NeMo I 2024-07-07 13:27:37 wer:334] \n", + " \n", + "[NeMo I 2024-07-07 13:27:37 wer:335] reference:v f o f h nine four\n", + "[NeMo I 2024-07-07 13:27:37 wer:336] predicted:v f o f h nine four\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "900e50b1b4e94425967d5fae15ac69b0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Validation: | | 0/? [00:00: store_path=/content/unadapted_predictions.json, local_path=/content/unadapted_predictions.json", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/workspace/nemo/NeMo-opensource/nemo/collections/asr/parts/utils/manifest_utils.py:477\u001b[0m, in \u001b[0;36mread_manifest\u001b[0;34m(manifest)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 477\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mmanifest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 478\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/unadapted_predictions.json'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[53], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m original_transcripts \u001b[38;5;241m=\u001b[39m \u001b[43mread_manifest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/content/unadapted_predictions.json\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m adapter_disabled_transcripts \u001b[38;5;241m=\u001b[39m read_manifest(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/content/adapter_disabled_predictions.json\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m orig, new \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(original_transcripts, adapter_disabled_transcripts):\n", + "File \u001b[0;32m/workspace/nemo/NeMo-opensource/nemo/collections/asr/parts/utils/manifest_utils.py:479\u001b[0m, in \u001b[0;36mread_manifest\u001b[0;34m(manifest)\u001b[0m\n\u001b[1;32m 477\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(manifest\u001b[38;5;241m.\u001b[39mget(), \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 478\u001b[0m 
\u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[0;32m--> 479\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mManifest file could not be opened: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmanifest\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 481\u001b[0m errors \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 482\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m f\u001b[38;5;241m.\u001b[39mreadlines():\n", + "\u001b[0;31mException\u001b[0m: Manifest file could not be opened: : store_path=/content/unadapted_predictions.json, local_path=/content/unadapted_predictions.json" + ] + } + ], "source": [ "original_transcripts = read_manifest('/content/unadapted_predictions.json')\n", "adapter_disabled_transcripts = read_manifest('/content/adapter_disabled_predictions.json')\n", @@ -1869,7 +2968,8 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { From 9eae1a52d2b028c68197c12f032ae0272fc12ae7 Mon Sep 17 00:00:00 2001 From: kolubex Date: Mon, 26 Aug 2024 06:50:45 -0400 Subject: [PATCH 3/4] pretrain+av+au_done --- balu_codes/configs/c1.yaml | 286 ++++++ balu_codes/configs/c10.yaml | 286 ++++++ balu_codes/configs/c11.yaml | 286 ++++++ balu_codes/configs/c2.yaml | 286 ++++++ balu_codes/configs/c3.yaml | 286 ++++++ .../configs/c3_au_with_same_av_arch.yaml | 309 ++++++ balu_codes/configs/c4.yaml | 309 ++++++ balu_codes/configs/c5.yaml | 286 ++++++ balu_codes/configs/c6.yaml | 286 ++++++ balu_codes/configs/c7.yaml | 286 ++++++ balu_codes/configs/c8.yaml | 286 ++++++ balu_codes/configs/c9.yaml | 286 ++++++ balu_codes/infer_av_asr.py | 70 ++ balu_codes/infer_test.ipynb | 919 ++++++++++++++++++ balu_codes/saving_with_comments.yaml | 309 ++++++ balu_codes/testing_av_code.ipynb | 674 +++++++++---- balu_codes/train_av_asr.py | 135 +++ balu_codes/train_av_asr.sh | 74 ++ balu_codes/transcribe.py | 6 +- .../asr/data/audio_to_text_dataset.py | 70 +- nemo/collections/asr/data/av_to_text.py | 198 +++- nemo/collections/asr/metrics/av_wer.py | 263 +++++ nemo/collections/asr/models/__init__.py | 1 + .../asr/models/av_ctc_bpe_models.py | 338 +------ nemo/collections/asr/models/av_ctc_models.py | 284 +++--- nemo/collections/asr/models/ctc_models.py | 1 - .../common/parts/preprocessing/collections.py | 28 +- .../common/parts/preprocessing/manifest.py | 159 +++ 28 files changed, 6270 insertions(+), 737 deletions(-) create mode 100644 balu_codes/configs/c1.yaml create mode 100644 balu_codes/configs/c10.yaml create mode 100644 balu_codes/configs/c11.yaml create mode 100644 balu_codes/configs/c2.yaml create mode 100644 balu_codes/configs/c3.yaml create mode 100644 balu_codes/configs/c3_au_with_same_av_arch.yaml create mode 100644 balu_codes/configs/c4.yaml create mode 100644 balu_codes/configs/c5.yaml create mode 100644 balu_codes/configs/c6.yaml create mode 100644 balu_codes/configs/c7.yaml create mode 100644 balu_codes/configs/c8.yaml create mode 100644 balu_codes/configs/c9.yaml create mode 100644 balu_codes/infer_av_asr.py create mode 100644 balu_codes/infer_test.ipynb create mode 100644 balu_codes/saving_with_comments.yaml create mode 100644 balu_codes/train_av_asr.py create mode 100644 balu_codes/train_av_asr.sh create mode 100644 nemo/collections/asr/metrics/av_wer.py diff --git a/balu_codes/configs/c1.yaml 
b/balu_codes/configs/c1.yaml new file mode 100644 index 000000000000..c90d9447139f --- /dev/null +++ b/balu_codes/configs/c1.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: false +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: au_pdec_uman_stok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: false +use_pretrained_dec: true +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.7 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 768 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 512 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - 
'''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: false + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c10.yaml b/balu_codes/configs/c10.yaml new file mode 100644 index 000000000000..2d43b50fbb79 --- /dev/null +++ b/balu_codes/configs/c10.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: false +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: au_ndec_uman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.5 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_test_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: 
/home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c11.yaml b/balu_codes/configs/c11.yaml new file mode 100644 index 000000000000..485ac0cc6fc0 --- /dev/null +++ b/balu_codes/configs/c11.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: true +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: au_ndec_lman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: 
true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.5 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_test.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q 
+ - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c2.yaml b/balu_codes/configs/c2.yaml new file mode 100644 index 000000000000..6a91158d2c2c --- /dev/null +++ b/balu_codes/configs/c2.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: true +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: au_ndec_lman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: false +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.6 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.6 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.6 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 512 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + 
subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c3.yaml b/balu_codes/configs/c3.yaml new file mode 100644 index 000000000000..0cfb64391b68 --- /dev/null +++ b/balu_codes/configs/c3.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: true +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: au_ndec_uman_stok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: false +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.7 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + 
get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 768 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 512 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c3_au_with_same_av_arch.yaml b/balu_codes/configs/c3_au_with_same_av_arch.yaml new file mode 100644 index 000000000000..7fdecab3df03 --- /dev/null +++ 
b/balu_codes/configs/c3_au_with_same_av_arch.yaml @@ -0,0 +1,309 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large # CHANGE; the BPE: prefix is a must, since it is used to load the audio encoder. +labelled_manifest: true # CHANGE +exp_dir: /tmp/bld56_dataset_v1/tmp/ # CHANGE +wandb: + run_name: "au_ndec_lman_ntok_NArch_0.5" # CHANGE + project: "NEMO_TEST" # CHANGE + create_wandb_logger: true # CHANGE + log_model: False # CHANGE + +use_video_modality: true # CHANGE +use_pretrained_dec: false # CHANGE +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_train.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: true # CHANGE + sample_rate: 16000 + batch_size: 96 # CHANGE + shuffle: true + num_workers: 8 # CHANGE + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 # CHANGE + min_duration: 0.1 + is_tarred: false # CHANGE + tarred_audio_filepaths: null # CHANGE + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: true # CHANGE + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: true # CHANGE + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. 
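+  # NOTE (assumption): the parsing of override_snr_ratio lives in the av_to_text dataset code added by this patch; keeping the same value for train_ds, validation_ds and test_ds keeps the audio/noise mixing conditions comparable across splits.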
+ use_start_end_token: false + +# NEW TOKENIZER +tokenizer: # CHANGE + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab + + +# OLD TOKENIZER +# tokenizer: # CHANGE # CHANGE THE NUM CLASSES TO 128 in DEC +# dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ +# type: bpe +# model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model +# vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt +# spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab + +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 + +av_encoder: # CHANGE + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 + +v_model: # CHANGE + feat_dim: 512 + + +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 # CHANGE to 356 for new tok, else 128. 
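+  # NOTE (assumption): num_classes is expected to match the active tokenizer (356 pieces for final_tokenizer, 128 for init_toknizer); the BPE model class should re-derive decoder.vocabulary and num_classes from the tokenizer at init, so the hand-written vocabulary list below is effectively a fallback.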
+ vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false # CHANGE + +adapters: # CHANGE + linear_adapter: + keep: true + name: "AV_v1" #@param {type:"string"} + dim: 64 #@param {type:"integer"} + activation: "swish" #@param {type:"string"} + norm_position: "pre" #@param ["pre", "post"] + dropout: 0.1 #@param {type:"number"} + multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + rel_position_multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + + + + + +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c4.yaml b/balu_codes/configs/c4.yaml new file mode 100644 index 000000000000..f82a69fd51a9 --- /dev/null +++ b/balu_codes/configs/c4.yaml @@ -0,0 +1,309 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large # CHANGE; the BPE: prefix is a must, since it is used to load the audio encoder. +labelled_manifest: false # CHANGE +exp_dir: /tmp/bld56_dataset_v1/tmp/ # CHANGE +wandb: + run_name: "av_ndec_uman_stok" # CHANGE + project: "NEMO_TEST" # CHANGE + create_wandb_logger: true # CHANGE + log_model: False # CHANGE + +use_video_modality: true # CHANGE +use_pretrained_dec: false # CHANGE +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_train_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 96 # CHANGE + shuffle: true + num_workers: 8 # CHANGE + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 # CHANGE + min_duration: 0.1 + is_tarred: false # CHANGE + tarred_audio_filepaths: null # CHANGE + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. 
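+  # NOTE (assumption): in NeMo, bucketing_strategy and bucketing_batch_size only take effect for tarred datasets; with is_tarred: false above they are carried along unused, and the flat batch_size is what actually applies.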
+ bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep this true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE; if float, it is considered as the SNR; if None, it goes by the manifest. + use_start_end_token: false + +# NEW TOKENIZER +# tokenizer: # CHANGE +# dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ +# type: bpe +# model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model +# vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt +# spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab + + +# OLD TOKENIZER +tokenizer: # CHANGE # CHANGE THE NUM CLASSES TO 128 in DEC + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab + +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 + +av_encoder: # CHANGE + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 + +v_model: # CHANGE + feat_dim: 512 + + +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 # CHANGE to 356 for new tok, else 128. 
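+  # NOTE: feat_in: 512 matches encoder.d_model above, and num_classes: 128 is consistent with the init_toknizer selected under OLD TOKENIZER, per the CHANGE note.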
+ vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false # CHANGE + +adapters: # CHANGE + linear_adapter: + keep: true + name: "AV_v1" #@param {type:"string"} + dim: 64 #@param {type:"integer"} + activation: "swish" #@param {type:"string"} + norm_position: "pre" #@param ["pre", "post"] + dropout: 0.1 #@param {type:"number"} + multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + rel_position_multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + + + + + +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c5.yaml b/balu_codes/configs/c5.yaml new file mode 100644 index 000000000000..41c4f585431c --- /dev/null +++ b/balu_codes/configs/c5.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: true +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: av_ndec_lman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.5 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + 
shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c6.yaml b/balu_codes/configs/c6.yaml new file mode 100644 index 000000000000..1f0fa02fa62d --- /dev/null +++ b/balu_codes/configs/c6.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch 
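+# NOTE (assumption): mean_batch averages the per-utterance CTC loss over the batch, so the loss scale is independent of batch size; NeMo's CTCLoss also supports sum-style reductions.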
+skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: false +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: av_ndec_uman_stok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.7 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 768 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 768 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c 
+ - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c7.yaml b/balu_codes/configs/c7.yaml new file mode 100644 index 000000000000..5590558bf4a5 --- /dev/null +++ b/balu_codes/configs/c7.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: false +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: pre_av_ndec_uman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/pretraining_train_manifest.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomizedp + override_snr_ratio: 0.0 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/pretraining_eval_manifest.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.0 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/pretraining_eval_manifest.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: true + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.0 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + 
window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c8.yaml b/balu_codes/configs/c8.yaml new file mode 100644 index 000000000000..bee33a706210 --- /dev/null +++ b/balu_codes/configs/c8.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: true +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: av_ndec_lman_stok_fullau + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomized + override_snr_ratio: 0.7 + 
bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.7 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 768 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 768 + num_classes: 128 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: 
false + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/configs/c9.yaml b/balu_codes/configs/c9.yaml new file mode 100644 index 000000000000..5de581c9429b --- /dev/null +++ b/balu_codes/configs/c9.yaml @@ -0,0 +1,286 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large +labelled_manifest: false +exp_dir: /tmp/bld56_dataset_v1/tmp/ +wandb: + run_name: av_ndec_uman_ntok + project: NEMO_TEST + create_wandb_logger: true + log_model: false +use_video_modality: true +use_pretrained_dec: false +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: true + num_workers: 10 + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 + min_duration: 0.1 + is_tarred: false + tarred_audio_filepaths: null + shuffle_n: 2048 + bucketing_strategy: synced_randomizedp + override_snr_ratio: 0.5 + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json + video_frame_rate: 5 + get_vid_feats: true + get_zero_vid_feats: false + sample_rate: 16000 + batch_size: 96 + shuffle: false + num_workers: 10 + pin_memory: true + override_snr_ratio: 0.5 + use_start_end_token: false +tokenizer: + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 +av_encoder: + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 +v_model: + feat_dim: 768 +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + 
dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 356 + vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false +adapters: + linear_adapter: + keep: true + name: AV_v1 + dim: 64 + activation: swish + norm_position: pre + dropout: 0.1 + multi_head_attention_adapter: + keep: false + rel_position_multi_head_attention_adapter: + keep: false +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/infer_av_asr.py b/balu_codes/infer_av_asr.py new file mode 100644 index 000000000000..7e95b165f40c --- /dev/null +++ b/balu_codes/infer_av_asr.py @@ -0,0 +1,70 @@ +import os +import sys +sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/')) +import nemo.collections.asr as nemo_asr +from omegaconf import OmegaConf +import torch +import json + +# Function to load the model from a .nemo file +def load_model(nemo_file_path): + model = nemo_asr.models.AV_EncDecCTCModelBPE.restore_from(nemo_file_path) + model.eval() + return model + +# Function to perform inference on a single sample +def infer_single_sample(model, sample): + # Prepare input data + audio_file = sample['audio_filepath'] + video_file = sample['video_filepath'] + feature_file = sample['feature_file'] + duration = sample['duration'] + + # Perform inference + transcription = model.transcribe( + audio=[audio_file], + return_hypotheses = True, + override_duration = duration, + ) + + return transcription[0] + +# Function to run inference on a manifest file +def run_inference(manifest_file_path, nemo_file_path, output_file_path): + # Load the model + model = load_model(nemo_file_path) + + # Read the manifest file + with open(manifest_file_path, 'r') as f: + manifest_data = [json.loads(line.strip()) for line in f] + + # Run inference on each sample in the manifest + results = [] + for sample in manifest_data: + transcription = infer_single_sample(model, sample) + result = { + 'audio_filepath': sample['audio_filepath'], + 'video_filepath': sample['video_filepath'], + 'feature_file': sample['feature_file'], + 'duration': sample['duration'], + 'transcription': transcription + } + results.append(result) + + # Save the results to the output file + with open(output_file_path, 'w') as f: + for result in results: + 
f.write(json.dumps(result) + '\n') + + print(f"Inference completed. Results saved to {output_file_path}") + +# Main function +def main(): + manifest_file_path = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json' # Path to your input manifest file + nemo_file_path = '/tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo' # Path to your trained .nemo file + output_file_path = 'temp.json' # Path to save the inference results + + run_inference(manifest_file_path, nemo_file_path, output_file_path) + +if __name__ == "__main__": + main() diff --git a/balu_codes/infer_test.ipynb b/balu_codes/infer_test.ipynb new file mode 100644 index 000000000000..253e57438f4e --- /dev/null +++ b/balu_codes/infer_test.ipynb @@ -0,0 +1,919 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## For valdation with given model path" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/'))\n", + "import nemo.core as nemo_core\n", + "from nemo.core import adapter_mixins\n", + "from nemo.utils import exp_manager\n", + "import nemo.collections.asr as nemo_asr\n", + "import nemo\n", + "import json\n", + "from omegaconf import OmegaConf, open_dict\n", + "import torch\n", + "from pytorch_lightning import Trainer\n", + "from lightning.pytorch.loggers import WandbLogger\n", + "from torchmetrics.text import WordErrorRate\n", + "import warnings\n", + "import argparse" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_configure_model(config_file_path):\n", + " conf = OmegaConf.load(config_file_path)\n", + " overrides = OmegaConf.from_cli()\n", + " updated_conf = OmegaConf.merge(conf, overrides)\n", + " OmegaConf.set_struct(updated_conf, True)\n", + " model = nemo_asr.models.AV_EncDecCTCModelBPE(updated_conf)\n", + "\n", + " model.setup_training_data(model.cfg.train_ds)\n", + " return model, conf\n", + "\n", + "# Function to freeze and unfreeze model parameters based on adapters\n", + "def manage_model_adapters(model, conf):\n", + " # Freeze the entire model\n", + " model.freeze()\n", + " \n", + " # Determine which modules to train based on configuration\n", + " if model.cfg.use_video_modality:\n", + " modules_to_train = [\n", + " model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, \n", + " model.a_modal_embs, model.v_modal_embs, model.decoder, model.a_pos_enc, model.v_pos_enc\n", + " ]\n", + " elif not model.cfg.use_video_modality and model.cfg.use_pretrained_dec:\n", + " modules_to_train = [model.a_model.decoder]\n", + " else: # not model.cfg.use_video_modality and not model.cfg.use_pretrained_dec\n", + " modules_to_train = [model.decoder]\n", + " \n", + " # Set the selected modules to training mode and enable gradients\n", + " for module in modules_to_train:\n", + " module.train()\n", + " for param in module.parameters():\n", + " param.requires_grad = True\n", + "\n", + " # Handle adapter configurations if needed\n", + " if conf.adapters.linear_adapter.keep:\n", + " model.a_model.freeze()\n", + " model.a_model.set_enabled_adapters(enabled=False)\n", + " model.a_model.set_enabled_adapters(name=conf.adapters.linear_adapter.name, enabled=True)\n", + " model.a_model.unfreeze_enabled_adapters()\n", + " else:\n", + " model.a_model.unfreeze()\n", + "\n", + "# Function to set up the 
trainer\n", + "def setup_trainer():\n", + " torch.set_float32_matmul_precision('high')\n", + " accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", + " trainer = Trainer(\n", + " devices=1, accelerator=accelerator, \n", + " # strategy=\"ddp_find_unused_parameters_true\",\n", + " # strategy=\"ddp_notebook\",\n", + " max_epochs=100,\n", + " enable_checkpointing=False, logger=False,\n", + " log_every_n_steps=5, check_val_every_n_epoch=1,\n", + " )\n", + " return trainer\n", + "\n", + "# Function to set up experiment manager\n", + "def setup_exp_manager(trainer, model):\n", + " os.environ.pop('NEMO_EXPM_VERSION', None)\n", + "\n", + " exp_config = exp_manager.ExpManagerConfig(\n", + " exp_dir=model.cfg.exp_dir,\n", + " name=f'{model.cfg.wandb.run_name}',\n", + " checkpoint_callback_params=exp_manager.CallbackParams(\n", + " monitor=\"val_u_wer\",\n", + " mode=\"min\",\n", + " always_save_nemo=True,\n", + " save_best_model=True,\n", + " ),\n", + " create_wandb_logger=model.cfg.wandb.create_wandb_logger,\n", + " wandb_logger_kwargs=OmegaConf.create({\"project\": f\"{model.cfg.wandb.project}\", \"name\": f\"{model.cfg.wandb.run_name}_{model.cfg.train_ds.override_snr_ratio}\", \"log_model\": model.cfg.wandb.log_model}),\n", + " )\n", + "\n", + " exp_config = OmegaConf.structured(exp_config)\n", + " logdir = exp_manager.exp_manager(trainer, exp_config)\n", + " if model.cfg.wandb.create_wandb_logger:\n", + " trainer.loggers[1].log_hyperparams(OmegaConf.to_container(model.cfg)) # wandb logger\n", + " # log the manifest file to wandb server\n", + " trainer.loggers[1].experiment.log_artifact(f\"{model.cfg.train_ds.manifest_filepath}\")\n", + " trainer.loggers[1].experiment.log_artifact(f\"{model.cfg.validation_ds.manifest_filepath}\")\n", + " \n", + " return logdir\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-25 15:44:38 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-25 15:44:38 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json\n", + " sample_rate: 16000\n", + " batch_size: 1\n", + " shuffle: true\n", + " num_workers: 4\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " is_tarred: true\n", + " tarred_audio_filepaths:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar\n", + " - - 
/data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar\n", + " shuffle_n: 2048\n", + " bucketing_strategy: synced_randomized\n", + " bucketing_batch_size:\n", + " - 34\n", + " - 30\n", + " - 26\n", + " - 22\n", + " - 18\n", + " - 16\n", + " - 12\n", + " - 8\n", + " \n", + "[NeMo W 2024-08-25 15:44:38 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + " Validation config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n", + "[NeMo W 2024-08-25 15:44:38 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-25 15:44:38 features:305] PADDING: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-25 15:44:39 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "EncDecCTCModelBPE(\n", + " (preprocessor): AudioToMelSpectrogramPreprocessor(\n", + " (featurizer): FilterbankFeatures()\n", + " )\n", + " (encoder): ConformerEncoder(\n", + " (pre_encode): ConvSubsampling(\n", + " (out): Linear(in_features=10240, out_features=512, bias=True)\n", + " (conv): Sequential(\n", + " (0): Conv2d(1, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", + " (1): ReLU(inplace=True)\n", + " (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", + " (3): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (pos_enc): RelPositionalEncoding(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (layers): ModuleList(\n", + " (0-17): 18 x ConformerLayer(\n", + " (norm_feed_forward1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (feed_forward1): ConformerFeedForward(\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, 
out_features=512, bias=True)\n", + " )\n", + " (norm_conv): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (conv): ConformerConvolution(\n", + " (pointwise_conv1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))\n", + " (depthwise_conv): CausalConv1D(512, 512, kernel_size=(31,), stride=(1,), groups=512)\n", + " (batch_norm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activation): Swish()\n", + " (pointwise_conv2): Conv1d(512, 512, kernel_size=(1,), stride=(1,))\n", + " )\n", + " (norm_self_att): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (self_attn): RelPositionMultiHeadAttention(\n", + " (linear_q): Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_k): Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_v): Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_out): Linear(in_features=512, out_features=512, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear_pos): Linear(in_features=512, out_features=512, bias=False)\n", + " )\n", + " (norm_feed_forward2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (feed_forward2): ConformerFeedForward(\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, out_features=512, bias=True)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (norm_out): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (decoder): ConvASRDecoder(\n", + " (decoder_layers): Sequential(\n", + " (0): Conv1d(512, 129, kernel_size=(1,), stride=(1,))\n", + " )\n", + " )\n", + " (loss): CTCLoss()\n", + " (spec_augmentation): SpectrogramAugmentation(\n", + " (spec_augment): SpecAugment()\n", + " )\n", + " (wer): WER()\n", + ")\n" + ] + } + ], + "source": [ + "# Main function to execute the workflow\n", + "# def main(config_file_path, args):\n", + "# config_file_path = '/home/bld56/gsoc/nemo/NeMo-opensource/balu_codes/configs/c1.yaml'\n", + "# model, conf = load_and_configure_model(config_file_path)\n", + "# ckpt_path = f\"/tmp/bld56_dataset_v1/saved_models/pre_av_ndec_uman_ntok--val_u_wer=0.0809-epoch=11.ckpt\"\n", + "ckpt_path = f\"/home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\"\n", + "model = nemo_asr.models.AV_EncDecCTCModelBPE.restore_from(ckpt_path, override_config_path=None) \n", + "model.cfg.train_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json'\n", + "model.cfg.validation_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json'\n", + "model.cfg.test_ds.manifest_filepath = '/tmp/bld56_dataset_v1/it2/annotations/manifest_test_no_label.json'\n", + "print(model)\n", + "# model.cfg.wandb.run_name += 'pre+'\n", + "# manage_model_adapters(model, conf)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-25 15:44:39 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. 
HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...\n", + " \n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "HPU available: False, using: 0 HPUs\n" + ] + } + ], + "source": [ + "trainer = setup_trainer()\n", + "model.set_trainer(trainer)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-25 15:46:29 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...\n", + " \n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]\n", + "[NeMo W 2024-08-25 15:46:29 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:105: Total length of `list` across ranks is zero. Please make sure this was your intention.\n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.validate(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From Aug 16 Weekly meet to develop to transcribe fucniton" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/'))\n", + "import nemo.collections.asr as nemo_asr\n", + "import json\n", + "import nemo.collections.asr.data.av_to_text" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Function to load the model from a .nemo file\n", + "def load_model(nemo_file_path):\n", + " model = nemo_asr.models.AV_EncDecCTCModelBPE.restore_from(nemo_file_path)\n", + " model.eval()\n", + " return model\n", + "\n", + "# Function to perform inference on a single sample\n", + "def infer_single_sample(model, sample):\n", + " # Prepare input data\n", + " audio_file = sample['audio_filepath']\n", + " video_file = sample['video_filepath']\n", + " feature_file = sample['feature_file']\n", + " duration = sample['duration']\n", + " \n", + " # Perform inference\n", + " transcription = model.transcribe(\n", + " audio=[audio_file],\n", + " return_hypotheses = True,\n", + " override_duration = duration,\n", + " )\n", + " \n", + " return transcription[0]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "import sentencepiece as spm\n", + "\n", + "# Load the tokenizer model from the specified path\n", + "def load_tokenizer(tokenizer_model_path):\n", + " tokenizer = spm.SentencePieceProcessor()\n", + " tokenizer.load(tokenizer_model_path)\n", + " return tokenizer\n", + "\n", + "# tokenizer_path = \"/home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model\"\n", + "# for i in range(self.tokenizer.vocab_size):\n", + "# piece = self.tokenizer.ids_to_tokens([i])\n", + "# piece = piece[0]\n", + "# vocabulary[piece] = i + 1\n", + "# tokenizer = 
load_tokenizer(tokenizer_path)\n", + "config = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-17 12:36:32 mixins:172] Tokenizer SentencePieceTokenizer initialized with 356 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-17 12:36:32 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json\n", + " video_frame_rate: 5\n", + " get_vid_feats: true\n", + " get_zero_vid_feats: false\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: true\n", + " num_workers: 11\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " is_tarred: false\n", + " tarred_audio_filepaths: null\n", + " shuffle_n: 2048\n", + " bucketing_strategy: synced_randomized\n", + " override_snr_ratio: 0.7\n", + " bucketing_batch_size:\n", + " - 34\n", + " - 30\n", + " - 26\n", + " - 22\n", + " - 18\n", + " - 16\n", + " - 12\n", + " - 8\n", + " \n", + "[NeMo W 2024-08-17 12:36:32 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + " Validation config : \n", + " manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json\n", + " video_frame_rate: 5\n", + " get_vid_feats: true\n", + " get_zero_vid_feats: false\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 11\n", + " pin_memory: true\n", + " override_snr_ratio: 0.7\n", + " use_start_end_token: false\n", + " \n", + "[NeMo W 2024-08-17 12:36:32 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json\n", + " video_frame_rate: 5\n", + " get_vid_feats: true\n", + " get_zero_vid_feats: false\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 11\n", + " pin_memory: true\n", + " override_snr_ratio: 0.7\n", + " use_start_end_token: false\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-17 12:36:32 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-17 12:36:32 cloud:64] Re-using file from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-08-17 12:36:32 common:815] Instantiating model from pre-trained checkpoint\n", + "Updated encoder _target_ model : nemo.collections.asr.modules.conformer_encoder.ConformerEncoderAdapter\n", + "[NeMo I 2024-08-17 12:36:32 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-17 12:36:32 cloud:64] Re-using file 
from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-08-17 12:36:32 common:815] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-08-17 12:36:33 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-08-17 12:36:33 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json\n", + " sample_rate: 16000\n", + " batch_size: 1\n", + " shuffle: true\n", + " num_workers: 4\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " is_tarred: true\n", + " tarred_audio_filepaths:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar\n", + " shuffle_n: 2048\n", + " bucketing_strategy: synced_randomized\n", + " bucketing_batch_size:\n", + " - 34\n", + " - 30\n", + " - 26\n", + " - 22\n", + " - 18\n", + " - 16\n", + " - 12\n", + " - 8\n", + " \n", + "[NeMo W 2024-08-17 12:36:33 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
\n", + " Validation config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n", + "[NeMo W 2024-08-17 12:36:33 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-17 12:36:33 features:305] PADDING: 0\n", + "[NeMo I 2024-08-17 12:36:34 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-17 12:36:34 save_restore_connector:263] Model AV_EncDecCTCModelBPE was successfully restored from /tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo.\n" + ] + }, + { + "data": { + "text/plain": [ + "AV_EncDecCTCModelBPE(\n", + " (a_model): EncDecCTCModelBPE(\n", + " (preprocessor): AudioToMelSpectrogramPreprocessor(\n", + " (featurizer): FilterbankFeatures()\n", + " )\n", + " (encoder): ConformerEncoderAdapter(\n", + " (pre_encode): ConvSubsampling(\n", + " (out): Linear(in_features=10240, out_features=512, bias=True)\n", + " (conv): Sequential(\n", + " (0): Conv2d(1, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", + " (1): ReLU(inplace=True)\n", + " (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", + " (3): ReLU(inplace=True)\n", + " )\n", + " )\n", + " (pos_enc): RelPositionalEncoding(\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (layers): ModuleList(\n", + " (0-17): 18 x ConformerLayer(\n", + " (norm_feed_forward1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (feed_forward1): ConformerFeedForward(\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, out_features=512, bias=True)\n", + " )\n", + " (norm_conv): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (conv): ConformerConvolution(\n", + " (pointwise_conv1): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))\n", + " (depthwise_conv): CausalConv1D(512, 512, kernel_size=(31,), stride=(1,), groups=512)\n", + " (batch_norm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (activation): Swish()\n", + " (pointwise_conv2): Conv1d(512, 512, kernel_size=(1,), stride=(1,))\n", + " )\n", + " (norm_self_att): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (self_attn): RelPositionMultiHeadAttention(\n", + " (linear_q): 
Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_k): Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_v): Linear(in_features=512, out_features=512, bias=True)\n", + " (linear_out): Linear(in_features=512, out_features=512, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear_pos): Linear(in_features=512, out_features=512, bias=False)\n", + " )\n", + " (norm_feed_forward2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (feed_forward2): ConformerFeedForward(\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, out_features=512, bias=True)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (norm_out): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (adapter_layer): ModuleDict(\n", + " (AV_v1): LinearAdapter(\n", + " (module): Sequential(\n", + " (0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (1): Linear(in_features=512, out_features=64, bias=False)\n", + " (2): SiLU(inplace=True)\n", + " (3): Linear(in_features=64, out_features=512, bias=False)\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (decoder): ConvASRDecoder(\n", + " (decoder_layers): Sequential(\n", + " (0): Conv1d(512, 129, kernel_size=(1,), stride=(1,))\n", + " )\n", + " )\n", + " (loss): CTCLoss()\n", + " (spec_augmentation): SpectrogramAugmentation(\n", + " (spec_augment): SpecAugment()\n", + " )\n", + " (wer): WER()\n", + " )\n", + " (a_linear): Linear(in_features=512, out_features=512, bias=True)\n", + " (v_linear): Linear(in_features=768, out_features=512, bias=True)\n", + " (av_enocder_layer): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)\n", + " )\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, out_features=512, bias=True)\n", + " (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (dropout1): Dropout(p=0.1, inplace=False)\n", + " (dropout2): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (av_encoder): TransformerEncoder(\n", + " (layers): ModuleList(\n", + " (0-3): 4 x TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)\n", + " )\n", + " (linear1): Linear(in_features=512, out_features=2048, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (linear2): Linear(in_features=2048, out_features=512, bias=True)\n", + " (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", + " (dropout1): Dropout(p=0.1, inplace=False)\n", + " (dropout2): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (a_modal_embs): Embedding(1, 512)\n", + " (v_modal_embs): Embedding(1, 512)\n", + " (a_pos_enc): Embedding(10000, 512)\n", + " (v_pos_enc): Embedding(10000, 512)\n", + " (decoder): ConvASRDecoder(\n", + " (decoder_layers): Sequential(\n", + " (0): Conv1d(512, 357, kernel_size=(1,), stride=(1,))\n", + " )\n", + " )\n", + " (loss): CTCLoss()\n", + " (wer): AV_WER()\n", 
+ ")" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "manifest_file_path = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json' # Path to your input manifest file\n", + "nemo_file_path = '/tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo' # Path to your trained .nemo file\n", + "output_file_path = 'temp.json' # Path to save the inference results\n", + "model = load_model(nemo_file_path)\n", + "model.to('cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-17 12:36:34 collections:321] Dataset loaded with 2200 files totalling 6.11 hours\n", + "[NeMo I 2024-08-17 12:36:34 collections:323] 0 files were filtered totalling 0.00 hours\n" + ] + } + ], + "source": [ + "dataset = nemo.collections.asr.data.av_to_text.AVToBPEDataset(\n", + " manifest_filepath='/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json',\n", + " tokenizer= model.tokenizer,\n", + " sample_rate= 16000,\n", + " int_values=config.get('int_values', False),\n", + " max_duration=config.get('max_duration', None),\n", + " min_duration=config.get('min_duration', None),\n", + " max_utts=config.get('max_utts', 0),\n", + " trim=config.get('trim_silence', False),\n", + " use_start_end_token=config.get('use_start_end_token', True),\n", + " return_sample_id=config.get('return_sample_id', False),\n", + " channel_selector=config.get('channel_selector', None),\n", + " video_frame_rate=config.get('video_frame_rate', 5),\n", + " get_vid_feats=config.get('get_vid_feats', True),\n", + " get_zero_vid_feats = config.get('get_zero_vid_feats', False),\n", + " override_snr_ratio = config.get('override_snr_ratio', None),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "batch_size = 1\n", + "dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.collate_fn)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "so regular and complete a part of normal everyday living that finding newspapers on the news then buying them morning and night was takenso regular and complete a part of normal everyday living that finding newspapers on the news then buying them morning and night was taken \n", + "\n", + "\n", + "++++++ re reguular and and com compleletee a p parart of of n normmalal eververyy dayay l liivving that that f findding n nwssppaapperss on on the neewssstandnds b buyying the them m mororninging and and n niightt+ wasas t taakinging+++rereggullar and and compleletee a a parart of of norormmal e eververy d dayay l liivving that that f findinging n nwssppaapperss on the the neewssstandnds b buyyinging themm m mornning and and+ n nighght+ wasas t t takk+ing+ ararararllararararararararlararararararararararararararararararararararlararararararararlararar\n" + ] + } + ], + "source": [ + "signal, signal_len, video_input_signal, transcript, transcript_len = dataloader.__iter__().__next__()\n", + "log_probs, encoded_len, predictions = model.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)\n", + "loss_value = model.loss(\n", + " log_probs=log_probs, targets=transcript, input_lengths=encoded_len, 
target_lengths=transcript_len\n", + " )\n", + "# print(transcript, predictions)\n", + "# tokenizer = load_tokenizer('/home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_toknizer/tokenizer.model')\n", + "# model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist())\n", + "# replace predictions[0] where 356 to 355\n", + "predictions[0][predictions[0] == 356] = 355\n", + "print(model.wer.decoding.decode_tokens_to_str(transcript[0].cpu().numpy().tolist()))\n", + "print('\\n')\n", + "print(model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist()))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "++++++ re reguular and and com compleletee a p parart of of n normmalal eververyy dayay l liivving that that f findding n nwssppaapperss on on the neewssstandnds b buyying the them m mororninging and and n niightt+ wasas t taakinging+++rereggullar and and compleletee a a parart of of norormmal e eververy d dayay l liivving that that f findinging n nwssppaapperss on the the neewssstandnds b buyyinging themm m mornning and and+ n nighght+ wasas t t takk+ing+ ararararllararararararararlararararararararararararararararararararararlararararararararlararar\n" + ] + } + ], + "source": [ + "import re\n", + "temp_str = model.wer.decoding.decode_tokens_to_str(predictions[0].cpu().numpy().tolist())\n", + "r_tags = re.findall(r'', temp_str)\n", + "for tag in r_tags:\n", + " unlabelled_h = temp_str.replace(tag, '')\n", + "print(unlabelled_h)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Function to run inference on a manifest file\n", + "def run_inference(manifest_file_path, nemo_file_path, output_file_path):\n", + " # Load the model\n", + " model = load_model(nemo_file_path)\n", + " \n", + " # Read the manifest file\n", + " with open(manifest_file_path, 'r') as f:\n", + " manifest_data = [json.loads(line.strip()) for line in f]\n", + " \n", + " # Run inference on each sample in the manifest\n", + " results = []\n", + " for sample in manifest_data:\n", + " transcription = infer_single_sample(model, sample)\n", + " result = {\n", + " 'audio_filepath': sample['audio_filepath'],\n", + " 'video_filepath': sample['video_filepath'],\n", + " 'feature_file': sample['feature_file'],\n", + " 'duration': sample['duration'],\n", + " 'transcription': transcription\n", + " }\n", + " results.append(result)\n", + " \n", + " # Save the results to the output file\n", + " with open(output_file_path, 'w') as f:\n", + " for result in results:\n", + " f.write(json.dumps(result) + '\\n')\n", + "\n", + " print(f\"Inference completed. 
Results saved to {output_file_path}\")\n", + "\n", + "# Main function\n", + "def main():\n", + " manifest_file_path = '/tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json' # Path to your input manifest file\n", + " nemo_file_path = '/tmp/bld56_dataset_v1/tmp/av_ndec_lman_ntok_0.5/2024-08-16_11-16-34/checkpoints/av_ndec_lman_ntok_0.5.nemo' # Path to your trained .nemo file\n", + " output_file_path = 'temp.json' # Path to save the inference results\n", + " \n", + " run_inference(manifest_file_path, nemo_file_path, output_file_path)\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/balu_codes/saving_with_comments.yaml b/balu_codes/saving_with_comments.yaml new file mode 100644 index 000000000000..b72034e77dd2 --- /dev/null +++ b/balu_codes/saving_with_comments.yaml @@ -0,0 +1,309 @@ +sample_rate: 16000 +log_prediction: true +ctc_reduction: mean_batch +skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large # CHANGE, BPE: is a must since, it is used to load audio encoder. +labelled_manifest: false # CHANGE +exp_dir: /tmp/bld56_dataset_v1/tmp/ # CHANGE +wandb: + run_name: "av_ndec_uman_stok" # CHANGE + project: "NEMO_TEST" # CHANGE + create_wandb_logger: true # CHANGE + log_model: False # CHANGE + +use_video_modality: true # CHANGE +use_pretrained_dec: false # CHANGE +train_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_train_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep it to true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 64 # CHANGE + shuffle: true + num_workers: 8 # CHANGE + pin_memory: true + use_start_end_token: false + trim_silence: false + max_duration: 20.0 # CHANGE + min_duration: 0.1 + is_tarred: false # CHANGE + tarred_audio_filepaths: null # CHANGE + shuffle_n: 2048 + bucketing_strategy: synced_randomizedp + override_snr_ratio: 0.5 # CHANGE if float, then coniders as snr, if None then goes by manifest. + bucketing_batch_size: + - 34 + - 30 + - 26 + - 22 + - 18 + - 16 + - 12 + - 8 +validation_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep it to true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 64 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE if float, then coniders as snr, if None then goes by manifest. + use_start_end_token: false +test_ds: + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep it to true. + get_zero_vid_feats: false # CHANGE + sample_rate: 16000 + batch_size: 64 + shuffle: false + num_workers: 8 + pin_memory: true + override_snr_ratio: 0.5 # CHANGE if float, then coniders as snr, if None then goes by manifest. 
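+  # A hypothetical manifest line for reference (audio_filepath, video_filepath,
+  # feature_file and duration match the AV manifests read by infer_av_asr.py;
+  # the exact transcript and SNR key names are assumptions):
+  #   {"audio_filepath": "x.wav", "video_filepath": "x.mp4",
+  #    "feature_file": "x.npy", "duration": 4.2, "text": "...", "snr": 0.5}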
+ use_start_end_token: false + +# NEW TOKENIZER +# tokenizer: # CHANGE +# dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/ +# type: bpe +# model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model +# vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt +# spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab + + +# OLD TOKENIZER +tokenizer: # CHANGE # CHANGE THE NUM CLASSES TO 128 in DEC + dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ + type: bpe + model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model + vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt + spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab + +preprocessor: + _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor + sample_rate: 16000 + normalize: per_feature + window_size: 0.025 + window_stride: 0.01 + window: hann + features: 80 + n_fft: 512 + log: true + frame_splicing: 1 + dither: 1.0e-05 + pad_to: 0 + pad_value: 0.0 +spec_augment: + _target_: nemo.collections.asr.modules.SpectrogramAugmentation + freq_masks: 2 + time_masks: 10 + freq_width: 27 + time_width: 0.05 + +av_encoder: # CHANGE + d_model: 512 + nhead: 8 + num_layers: 4 + dropout: 0.1 + +v_model: # CHANGE + feat_dim: 512 + + +encoder: + _target_: nemo.collections.asr.modules.ConformerEncoder + feat_in: 80 + feat_out: -1 + n_layers: 18 + d_model: 512 + subsampling: striding + subsampling_factor: 4 + subsampling_conv_channels: 512 + ff_expansion_factor: 4 + self_attention_model: rel_pos + n_heads: 8 + att_context_size: + - -1 + - -1 + xscaling: true + untie_biases: true + pos_emb_max_len: 5000 + conv_kernel_size: 31 + conv_norm_type: batch_norm + dropout: 0.1 + dropout_emb: 0.0 + dropout_att: 0.1 +decoder: + _target_: nemo.collections.asr.modules.ConvASRDecoder + feat_in: 512 + num_classes: 128 # CHANGE to 356 for new tok, else 128. 
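+  # NOTE: num_classes must match the vocabulary size of the tokenizer selected
+  # above (128 for the old tokenizer, 356 for the new one). A mismatch here is
+  # the usual cause of the "RuntimeError: blank must be in label range" failure
+  # seen in the sanity-check traceback in testing_av_code.ipynb below.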
+ vocabulary: + - + - ▁ + - s + - t + - e + - d + - o + - ▁the + - a + - i + - ▁a + - u + - 'y' + - m + - l + - 'n' + - p + - re + - c + - h + - r + - ▁s + - g + - ▁to + - er + - ing + - f + - ▁and + - an + - ▁i + - k + - ▁that + - '''' + - ▁of + - ▁in + - w + - ▁p + - ed + - or + - al + - ar + - ▁f + - en + - in + - b + - ▁you + - ▁w + - ▁b + - le + - ll + - es + - ▁it + - ve + - ur + - ▁we + - ▁re + - ▁be + - ly + - ▁is + - ▁he + - ▁o + - ▁c + - it + - ▁n + - ▁on + - un + - ▁t + - 'on' + - se + - th + - ce + - ▁do + - ic + - ▁for + - ▁th + - ion + - ch + - ▁was + - ri + - ent + - ▁g + - ver + - ▁co + - li + - ▁ha + - ▁ma + - la + - ro + - v + - us + - ▁ca + - ▁di + - ▁this + - ra + - ▁st + - ▁e + - ▁not + - ▁so + - ▁de + - ▁have + - ter + - ir + - ▁go + - ation + - ▁with + - ate + - ▁me + - ▁mo + - ment + - ▁con + - ▁but + - vi + - ▁pro + - ▁ho + - j + - ▁com + - ight + - ▁know + - ▁what + - ect + - ▁ex + - ▁some + - ▁would + - ▁like + - x + - ▁his + - q + - z +optim: + name: adamw + lr: 0.2 + betas: + - 0.9 + - 0.98 + weight_decay: 0.001 + sched: + name: NoamAnnealing + d_model: 512 + warmup_steps: 2000 + warmup_ratio: null + min_lr: 1.0e-07 +compute_eval_loss: false # CHANGE + +adapters: # CHANGE + linear_adapter: + keep: true + name: "AV_v1" #@param {type:"string"} + dim: 64 #@param {type:"integer"} + activation: "swish" #@param {type:"string"} + norm_position: "pre" #@param ["pre", "post"] + dropout: 0.1 #@param {type:"number"} + multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + rel_position_multi_head_attention_adapter: + keep: false # TODO @Balu: Needs deeper understanding of config. ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb + + + + + +variational_noise: + start_step: 0 + std: 0.0 +target: nemo.collections.asr.models.ctc_bpe_models.AV_EncDecCTCModelBPE +nemo_version: 1.9.0rc0 diff --git a/balu_codes/testing_av_code.ipynb b/balu_codes/testing_av_code.ipynb index a3a52c6b0971..f37945b9f2cc 100644 --- a/balu_codes/testing_av_code.ipynb +++ b/balu_codes/testing_av_code.ipynb @@ -9,99 +9,116 @@ "name": "stdout", "output_type": "stream", "text": [ - "/workspace/nemo/NeMo-opensource/nemo/collections/asr/__init__.py\n", - "/workspace/nemo/NeMo-opensource/nemo/core/__init__.py\n", - "/workspace/nemo/NeMo-opensource/nemo/__init__.py\n", - "/usr/local/lib/python3.10/dist-packages/lightning/__init__.py\n" + "\n" ] } ], "source": [ "import os\n", "import sys\n", - "\n", - "\n", - "# Insert local paths at the beginning of sys.path\n", - "sys.path.insert(0, os.path.abspath('/workspace/nemo/NeMo-opensource/'))\n", - "\n", - "import nemo.collections.asr as nemo_asr\n", - "print(nemo_asr.__file__)\n", + "sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource'))\n", + "import nemo\n", + "print(nemo)\n", "import nemo.core as nemo_core\n", - "print(nemo_core.__file__)\n", "from nemo.core import adapter_mixins\n", + "from nemo.utils import exp_manager\n", + "import nemo.collections.asr as nemo_asr\n", "import nemo\n", - "print(nemo.__file__)\n", - "import lightning\n", - "print(lightning.__file__)\n", - "# Restore the site-packages paths\n", - "# sys.path.extend(site_packages_paths)\n", - "\n", - "import torch\n", + "import json\n", "from omegaconf import OmegaConf, open_dict\n", + "import torch\n", "from pytorch_lightning import Trainer\n", "from lightning.pytorch.loggers import WandbLogger\n", - "wandb_logger = WandbLogger(project=\"NEMO_TEST\")\n", - "# import 
nemo.collections.asr as nemo_asr" + "from torchmetrics.text import WordErrorRate\n", + "import warnings\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2024-07-08 23:55:14 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", - "[NeMo I 2024-07-08 23:55:14 collections:319] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-08 23:55:14 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", - "[NeMo I 2024-07-08 23:55:14 collections:319] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-08 23:55:15 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", - "[NeMo I 2024-07-08 23:55:15 collections:319] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-08 23:55:15 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.\n", - "[NeMo I 2024-07-08 23:55:15 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo\n", - "[NeMo I 2024-07-08 23:55:15 common:815] Instantiating model from pre-trained checkpoint\n", - "[NeMo I 2024-07-08 23:55:16 features:305] PADDING: 16\n", - "[NeMo I 2024-07-08 23:55:17 save_restore_connector:263] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.\n" - ] - } - ], + "outputs": [], "source": [ - "TRAIN_MANIFEST = \"/disk1/it1/annotations/manifest_train.json\"\n", - "TEST_MANIFEST = \"/disk1/it1/annotations/manifest_train.json\"\n", - "override_config_file_path = \"/workspace/nemo/NeMo-opensource/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml\"\n", - "conf = OmegaConf.load(override_config_file_path)\n", - "OmegaConf.set_struct(conf, True)\n", - "model = nemo_asr.models.AV_EncDecCTCModel(conf)" + "# Function to load and configure the model\n", + "def load_and_configure_model(config_file_path):\n", + " conf = OmegaConf.load(config_file_path)\n", + " overrides = OmegaConf.from_cli()\n", + " updated_conf = OmegaConf.merge(conf, overrides)\n", + " OmegaConf.set_struct(updated_conf, True)\n", + " model = nemo_asr.models.AV_EncDecCTCModelBPE(updated_conf)\n", + "\n", + " model.setup_training_data(model.cfg.train_ds)\n", + " return model, conf\n", + "\n", + "# Function to freeze and unfreeze model parameters based on adapters\n", + "def manage_model_adapters(model, conf):\n", + " # Freeze the entire model\n", + " model.freeze()\n", + " \n", + " # Determine which modules to train based on configuration\n", + " if model.cfg.use_video_modality:\n", + " modules_to_train = [\n", + " model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, \n", + " model.a_modal_embs, model.v_modal_embs, model.decoder\n", + " ]\n", + " elif not model.cfg.use_video_modality and model.cfg.use_pretrained_dec:\n", + " modules_to_train = [model.a_model.decoder]\n", + " else: # not model.cfg.use_video_modality and not model.cfg.use_pretrained_dec\n", + " modules_to_train = [model.decoder]\n", + " \n", + " # Set the selected modules to training mode and enable gradients\n", + " for module in modules_to_train:\n", + " module.train()\n", + " for param in module.parameters():\n", + " param.requires_grad = True\n", + "\n", + " # Handle adapter configurations if needed\n", + " if 
conf.adapters.linear_adapter.keep:\n", + " model.a_model.freeze()\n", + " model.a_model.set_enabled_adapters(enabled=False)\n", + " model.a_model.set_enabled_adapters(name=conf.adapters.linear_adapter.name, enabled=True)\n", + " model.a_model.unfreeze_enabled_adapters()\n", + "\n", + "\n", + "# Function to set up the trainer\n", + "def setup_trainer():\n", + " accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", + " trainer = Trainer(\n", + " devices=1, accelerator=accelerator, max_epochs=100,\n", + " enable_checkpointing=False, logger=False,\n", + " log_every_n_steps=5, check_val_every_n_epoch=1\n", + " )\n", + " return trainer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2024-07-08 23:55:18 collections:317] Dataset loaded with 18351 files totalling 50.98 hours\n", - "[NeMo I 2024-07-08 23:55:18 collections:319] 0 files were filtered totalling 0.00 hours\n" - ] - } - ], + "outputs": [], "source": [ - "with open_dict(model.cfg):\n", - " # Train Dataloader\n", - " model.cfg.train_ds.manifest_filepath = TRAIN_MANIFEST\n", - " model.cfg.train_ds.batch_size = 32\n", - " model.cfg.train_ds.is_tarred = False\n", - " model.cfg.train_ds.tarred_audio_filepaths = None\n", + "# Function to set up experiment manager\n", + "def setup_exp_manager(trainer, model):\n", + " os.environ.pop('NEMO_EXPM_VERSION', None)\n", "\n", - " model.cfg.validation_ds.manifest_filepath = TEST_MANIFEST\n", - " model.cfg.validation_ds.batch_size = 32\n", + " exp_config = exp_manager.ExpManagerConfig(\n", + " exp_dir=model.cfg.exp_dir,\n", + " name=f'{model.cfg.wandb.run_name}',\n", + " checkpoint_callback_params=exp_manager.CallbackParams(\n", + " monitor=\"val_u_wer\",\n", + " mode=\"min\",\n", + " always_save_nemo=True,\n", + " save_best_model=True,\n", + " ),\n", + " create_wandb_logger=model.cfg.wandb.create_wandb_logger,\n", + " wandb_logger_kwargs=OmegaConf.create({\"project\": f\"{model.cfg.wandb.project}\", \"name\": f\"{model.cfg.wandb.run_name}\", \"log_model\": model.cfg.wandb.log_model}),\n", + " )\n", "\n", - "model.setup_training_data(model.cfg.train_ds)" + " exp_config = OmegaConf.structured(exp_config)\n", + " logdir = exp_manager.exp_manager(trainer, exp_config)\n", + " if model.cfg.wandb.create_wandb_logger:\n", + " trainer.loggers[1].log_hyperparams(OmegaConf.to_container(model.cfg)) # wandb logger\n", + " return logdir\n" ] }, { @@ -110,18 +127,7 @@ "metadata": {}, "outputs": [], "source": [ - "# model.summarize()\n", - "model.freeze()\n", - "# model.summarize()\n", - "modules_to_train = [model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, model.a_modal_embs\n", - " , model.v_modal_embs, model.decoder]\n", - "for module in modules_to_train:\n", - " module.train()\n", - " for param in module.parameters():\n", - " param.requires_grad = True\n", - " \n", - "\n", - "# model.summarize()" + "final_results = {}" ] }, { @@ -129,89 +135,172 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-02 11:46:40 mixins:172] Tokenizer SentencePieceTokenizer initialized with 356 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-02 11:46:42 collections:321] Dataset loaded with 22247 files totalling 61.80 hours\n", + "[NeMo I 2024-08-02 11:46:42 collections:323] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-08-02 11:46:42 
collections:321] Dataset loaded with 2447 files totalling 6.80 hours\n", + "[NeMo I 2024-08-02 11:46:42 collections:323] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-08-02 11:46:42 collections:321] Dataset loaded with 2447 files totalling 6.80 hours\n", + "[NeMo I 2024-08-02 11:46:42 collections:323] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-08-02 11:46:42 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-02 11:46:42 cloud:64] Re-using file from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-08-02 11:46:42 common:815] Instantiating model from pre-trained checkpoint\n", + "Updated encoder _target_ model : nemo.collections.asr.modules.conformer_encoder.ConformerEncoderAdapter\n", + "[NeMo I 2024-08-02 11:46:42 cloud:58] Found existing object /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-02 11:46:42 cloud:64] Re-using file from: /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-08-02 11:46:42 common:815] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-08-02 11:46:43 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO: GPU available: True (cuda), used: True\n", - "WARNING: Logging before flag parsing goes to stderr.\n", - "I0708 23:55:19.239430 134716664817472 rank_zero.py:64] GPU available: True (cuda), used: True\n", - "INFO: TPU available: False, using: 0 TPU cores\n", - "I0708 23:55:19.266383 134716664817472 rank_zero.py:64] TPU available: False, using: 0 TPU cores\n", - "INFO: HPU available: False, using: 0 HPUs\n", - "I0708 23:55:19.267290 134716664817472 rank_zero.py:64] HPU available: False, using: 0 HPUs\n" + "[NeMo W 2024-08-02 11:46:43 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json\n", + " sample_rate: 16000\n", + " batch_size: 1\n", + " shuffle: true\n", + " num_workers: 4\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " trim_silence: false\n", + " max_duration: 20.0\n", + " min_duration: 0.1\n", + " is_tarred: true\n", + " tarred_audio_filepaths:\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar\n", + " - - 
/data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar\n", + " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar\n", + " shuffle_n: 2048\n", + " bucketing_strategy: synced_randomized\n", + " bucketing_batch_size:\n", + " - 34\n", + " - 30\n", + " - 26\n", + " - 22\n", + " - 18\n", + " - 16\n", + " - 12\n", + " - 8\n", + " \n", + "[NeMo W 2024-08-02 11:46:43 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + " Validation config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n", + "[NeMo W 2024-08-02 11:46:43 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + " Test config : \n", + " manifest_filepath:\n", + " - /manifests/librispeech/librivox-dev-other.json\n", + " - /manifests/librispeech/librivox-dev-clean.json\n", + " - /manifests/librispeech/librivox-test-other.json\n", + " - /manifests/librispeech/librivox-test-clean.json\n", + " sample_rate: 16000\n", + " batch_size: 32\n", + " shuffle: false\n", + " num_workers: 8\n", + " pin_memory: true\n", + " use_start_end_token: false\n", + " \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 23:55:19 exp_manager:396] Experiments will be logged at test_experiments/test_wpe_quartz/2024-07-08_23-55-19\n", - "[NeMo I 2024-07-08 23:55:19 exp_manager:856] TensorboardLogger has been set up\n", - "[NeMo I 2024-07-08 23:55:19 exp_manager:871] WandBLogger has been set up\n" + "[NeMo I 2024-08-02 11:46:43 features:305] PADDING: 0\n", + "[NeMo I 2024-08-02 11:46:46 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /home/bld56/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-08-02 11:46:48 collections:321] Dataset loaded with 22247 files totalling 61.80 hours\n", + "[NeMo I 2024-08-02 11:46:48 collections:323] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:719] Setting adapter 'AV_v1' status : Enabled = False\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:734] Setting adapter 'AV_v1' status : Enabled = True\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module 
encoder.layers.2.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.8.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.16.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:405] Froze module encoder.layers.17.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-08-02 11:46:48 adapter_mixins:435] Unfrozen adapter : AV_v1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[NeMo W 2024-07-08 23:55:19 exp_manager:966] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 3000. Please ensure that max_steps will run for at least 3 epochs to ensure that checkpointing will not error out.\n" + "[NeMo W 2024-08-02 11:46:48 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. 
HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...\n", + " \n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "HPU available: False, using: 0 HPUs\n" ] - } - ], - "source": [ - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "max_steps = 3000\n", - "\n", - "trainer = Trainer(devices=1, accelerator=accelerator, max_steps=max_steps,\n", - " enable_checkpointing=False, logger=False,\n", - " log_every_n_steps=5, check_val_every_n_epoch=3)\n", - "\n", - "model.set_trainer(trainer)\n", - "\n", - "# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n", - "from nemo.utils import exp_manager\n", - "\n", - "\n", - "# Environment variable generally used for multi-node multi-gpu training.\n", - "# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.\n", - "os.environ.pop('NEMO_EXPM_VERSION', None)\n", - "\n", - "exp_config = exp_manager.ExpManagerConfig(\n", - " exp_dir=f'test_experiments/',\n", - " name=f\"test_wpe_quartz\",\n", - " checkpoint_callback_params=exp_manager.CallbackParams(\n", - " monitor=\"val_wer\",\n", - " mode=\"min\",\n", - " always_save_nemo=True,\n", - " save_best_model=True,\n", - " ),\n", - " create_wandb_logger=True,\n", - " wandb_logger_kwargs=OmegaConf.create({\"project\": \"NEMO_TEST\", \"name\": \"test_wpe_quartz\", \"log_model\":\"all\"}),\n", - ")\n", - "\n", - "exp_config = OmegaConf.structured(exp_config)\n", - "\n", - "logdir = exp_manager.exp_manager(trainer, exp_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-08-02 11:46:48 exp_manager:396] Experiments will be logged at /tmp/bld56_dataset_v1/tmp/au_ndec_lman_ntok_NArch_0.5/2024-08-02_11-46-48\n", + "[NeMo I 2024-08-02 11:46:48 exp_manager:856] TensorboardLogger has been set up\n", + "[NeMo I 2024-08-02 11:46:48 exp_manager:871] WandBLogger has been set up\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "E0708 23:55:20.059351 134716664817472 jupyter.py:224] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlakshmipathi-balaji\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" ] }, { "data": { "text/html": [ - "Tracking run with wandb version 0.17.4" + "Tracking run with wandb version 0.17.5" ], "text/plain": [ "" @@ -223,7 +312,7 @@ { "data": { "text/html": [ - "Run data is saved locally in test_experiments/wandb/run-20240708_235520-2024-07-08_23-55-19" + "Run data is saved locally in /tmp/bld56_dataset_v1/tmp/wandb/run-20240802_114650-2024-08-02_11-46-48" ], "text/plain": [ "" @@ -235,7 +324,7 @@ { "data": { "text/html": [ - "Syncing run test_wpe_quartz to Weights & Biases (docs)
" + "Syncing run au_ndec_lman_ntok_NArch_0.5 to Weights & Biases (docs)
" ], "text/plain": [ "" @@ -259,7 +348,7 @@ { "data": { "text/html": [ - " View run at https://wandb.ai/lakshmipathi-balaji/NEMO_TEST/runs/2024-07-08_23-55-19" + " View run at https://wandb.ai/lakshmipathi-balaji/NEMO_TEST/runs/2024-08-02_11-46-48" ], "text/plain": [ "" @@ -267,63 +356,86 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n" - ] - }, + } + ], + "source": [ + "# snr_list = [1,0.95,0.9,0.85,...0.5]\n", + "# snr_list = [1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]\n", + "snr_list = [0.5]\n", + "for snr in snr_list:\n", + " config_file_path = \"/home/bld56/gsoc/nemo/NeMo-opensource/balu_codes/configs/c3_au_with_same_av_arch.yaml\"\n", + " model, conf = load_and_configure_model(config_file_path)\n", + " manage_model_adapters(model, conf)\n", + "\n", + " trainer = setup_trainer()\n", + " model.set_trainer(trainer)\n", + " logdir = setup_exp_manager(trainer, model)\n", + " warnings.filterwarnings(\"ignore\", category=UserWarning, message=\"PySoundFile failed. Trying audioread instead.\")\n", + " warnings.filterwarnings(\"ignore\", category=FutureWarning, message=\"librosa.core.audio.__audioread_load\\n\\tDeprecated as of librosa version 0.10.0.\\n\\tIt will be removed in librosa version 1.0.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[NeMo I 2024-07-08 23:55:21 modelPT:767] Optimizer config = Novograd (\n", - " Parameter Group 0\n", - " amsgrad: False\n", - " betas: [0.8, 0.5]\n", - " eps: 1e-08\n", - " grad_averaging: False\n", - " lr: 0.01\n", - " weight_decay: 0.001\n", - " )\n", - "[NeMo I 2024-07-08 23:55:21 lr_scheduler:772] Scheduler not initialized as no `sched` config supplied to setup_optimizer()\n" - ] - }, + "data": { + "text/plain": [ + " | Name | Type | Params | Mode \n", + "----------------------------------------------------------------------\n", + "0 | a_model | EncDecCTCModelBPE | 122 M | eval \n", + "1 | a_linear | Linear | 262 K | train\n", + "2 | v_linear | Linear | 262 K | train\n", + "3 | av_enocder_layer | TransformerEncoderLayer | 3.2 M | train\n", + "4 | av_encoder | TransformerEncoder | 12.6 M | train\n", + "5 | a_modal_embs | Embedding | 512 | train\n", + "6 | v_modal_embs | Embedding | 512 | train\n", + "7 | a_pos_enc | Embedding | 5.1 M | eval \n", + "8 | v_pos_enc | Embedding | 5.1 M | eval \n", + "9 | decoder | ConvASRDecoder | 183 K | train\n", + "10 | loss | CTCLoss | 0 | eval \n", + "11 | wer | AV_WER | 0 | eval \n", + "----------------------------------------------------------------------\n", + "17.7 M Trainable params\n", + "131 M Non-trainable params\n", + "149 M Total params\n", + "597.643 Total estimated model params size (MB)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.summarize()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\n", - " | Name | Type | Params | Mode \n", - "---------------------------------------------------------------------\n", - "0 | a_model | EncDecCTCModel | 18.9 M | eval \n", - "1 | a_linear | Linear | 524 K | train\n", - "2 | v_linear | Linear | 262 K | train\n", - "3 | av_enocder_layer | TransformerEncoderLayer | 3.2 M | train\n", - "4 | av_encoder | TransformerEncoder | 6.3 M | train\n", - "5 | 
a_modal_embs | Embedding | 512 | train\n", - "6 | v_modal_embs | Embedding | 512 | train\n", - "7 | decoder | ConvASRDecoder | 14.9 K | train\n", - "8 | loss | CTCLoss | 0 | eval \n", - "9 | wer | WER | 0 | eval \n", - "---------------------------------------------------------------------\n", - "10.3 M Trainable params\n", - "18.9 M Non-trainable params\n", - "29.2 M Total params\n", - "116.740 Total estimated model params size (MB)\n" + "[NeMo W 2024-08-02 11:47:06 nemo_logging:349] /home/bld56/.miniconda3/envs/nemo/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/bld56/.miniconda3/envs/nemo/lib/python3.10/sit ...\n", + " \n", + "You are using a CUDA device ('NVIDIA L40S') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0ea39ad975454155a69aa466a018c9bc", + "model_id": "a5235241cf934dc8b3dfdeda510d7a1e", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Sanity Checking: | | 0/? [00:00┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃ Validate metric DataLoader 0 ┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ global_step 0.0 │\n", + "│ val_acc 0.0 │\n", + "│ val_l_wer 1.984375 │\n", + "│ val_loss 1231.60107421875 │\n", + "│ val_u_wer 2.1525423526763916 │\n", + "│ val_wer 2.1525423526763916 │\n", + "└───────────────────────────┴───────────────────────────┘\n", + "\n" + ], "text/plain": [ - "Training: | | 0/? [00:00\" \n", + " will be used during training (effective maximum steps = 139100) - \n", + " Parameters : \n", + " (d_model: 512\n", + " warmup_steps: 2000\n", + " warmup_ratio: null\n", + " min_lr: 1.0e-06\n", + " max_steps: 139100\n", + " )\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params | Mode\n", + "-----------------------------------------------------\n", + "0 | a_model | EncDecCTCModelBPE | 122 M | eval\n", + "1 | decoder | ConvASRDecoder | 183 K | eval\n", + "2 | loss | CTCLoss | 0 | eval\n", + "3 | wer | AV_WER | 0 | eval\n", + "-----------------------------------------------------\n", + "1.2 M Trainable params\n", + "121 M Non-trainable params\n", + "122 M Total params\n", + "491.530 Total estimated model params size (MB)\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "73fda7c1563740c6a0586b11d0af8638", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? 
[00:00 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:543\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 541\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 542\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 543\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:44\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher\u001b[38;5;241m.\u001b[39mlaunch(trainer_fn, \u001b[38;5;241m*\u001b[39margs, trainer\u001b[38;5;241m=\u001b[39mtrainer, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m _TunerExitException:\n\u001b[1;32m 47\u001b[0m _call_teardown_hook(trainer)\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:579\u001b[0m, in \u001b[0;36mTrainer._fit_impl\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 572\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 573\u001b[0m ckpt_path \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkpoint_connector\u001b[38;5;241m.\u001b[39m_select_ckpt_path(\n\u001b[1;32m 574\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfn,\n\u001b[1;32m 575\u001b[0m ckpt_path,\n\u001b[1;32m 576\u001b[0m model_provided\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 577\u001b[0m model_connected\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 578\u001b[0m )\n\u001b[0;32m--> 579\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mckpt_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 581\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstopped\n\u001b[1;32m 582\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:986\u001b[0m, in \u001b[0;36mTrainer._run\u001b[0;34m(self, model, ckpt_path)\u001b[0m\n\u001b[1;32m 981\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_signal_connector\u001b[38;5;241m.\u001b[39mregister_signal_handlers()\n\u001b[1;32m 983\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 984\u001b[0m \u001b[38;5;66;03m# RUN THE TRAINER\u001b[39;00m\n\u001b[1;32m 985\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[0;32m--> 986\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_stage\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 988\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 989\u001b[0m \u001b[38;5;66;03m# POST-Training CLEAN UP\u001b[39;00m\n\u001b[1;32m 990\u001b[0m \u001b[38;5;66;03m# ----------------------------\u001b[39;00m\n\u001b[1;32m 991\u001b[0m log\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: trainer tearing down\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:1028\u001b[0m, in \u001b[0;36mTrainer._run_stage\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1026\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining:\n\u001b[1;32m 1027\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m isolate_rng():\n\u001b[0;32m-> 1028\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_sanity_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1029\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
torch\u001b[38;5;241m.\u001b[39mautograd\u001b[38;5;241m.\u001b[39mset_detect_anomaly(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_detect_anomaly):\n\u001b[1;32m 1030\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit_loop\u001b[38;5;241m.\u001b[39mrun()\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:1057\u001b[0m, in \u001b[0;36mTrainer._run_sanity_check\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1054\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_start\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1056\u001b[0m \u001b[38;5;66;03m# run eval step\u001b[39;00m\n\u001b[0;32m-> 1057\u001b[0m \u001b[43mval_loop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1059\u001b[0m call\u001b[38;5;241m.\u001b[39m_call_callback_hooks(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mon_sanity_check_end\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1061\u001b[0m \u001b[38;5;66;03m# reset logger connector\u001b[39;00m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:182\u001b[0m, in \u001b[0;36m_no_grad_context.._decorator\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 180\u001b[0m context_manager \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mno_grad\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m context_manager():\n\u001b[0;32m--> 182\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloop_run\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py:135\u001b[0m, in \u001b[0;36m_EvaluationLoop.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_progress\u001b[38;5;241m.\u001b[39mis_last_batch \u001b[38;5;241m=\u001b[39m data_fetcher\u001b[38;5;241m.\u001b[39mdone\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# run step hooks\u001b[39;00m\n\u001b[0;32m--> 135\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_evaluation_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataloader_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataloader_iter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m:\n\u001b[1;32m 137\u001b[0m \u001b[38;5;66;03m# this needs to wrap the `*_step` call too (not just `next`) for `dataloader_iter` support\u001b[39;00m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py:396\u001b[0m, in \u001b[0;36m_EvaluationLoop._evaluation_step\u001b[0;34m(self, batch, batch_idx, dataloader_idx, dataloader_iter)\u001b[0m\n\u001b[1;32m 
390\u001b[0m hook_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_step\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mtesting \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalidation_step\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 391\u001b[0m step_args \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 392\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_build_step_args_from_hook_kwargs(hook_kwargs, hook_name)\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m using_dataloader_iter\n\u001b[1;32m 394\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m (dataloader_iter,)\n\u001b[1;32m 395\u001b[0m )\n\u001b[0;32m--> 396\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_strategy_hook\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhook_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mstep_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_progress\u001b[38;5;241m.\u001b[39mincrement_processed()\n\u001b[1;32m 400\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m using_dataloader_iter:\n\u001b[1;32m 401\u001b[0m \u001b[38;5;66;03m# update the hook kwargs now that the step method might have consumed the iterator\u001b[39;00m\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:311\u001b[0m, in \u001b[0;36m_call_strategy_hook\u001b[0;34m(trainer, hook_name, *args, **kwargs)\u001b[0m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 310\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mprofiler\u001b[38;5;241m.\u001b[39mprofile(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[Strategy]\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtrainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhook_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 311\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 313\u001b[0m \u001b[38;5;66;03m# restore current_fx when nested context\u001b[39;00m\n\u001b[1;32m 314\u001b[0m pl_module\u001b[38;5;241m.\u001b[39m_current_fx_name \u001b[38;5;241m=\u001b[39m prev_fx_name\n", + "File \u001b[0;32m~/.miniconda3/envs/nemo/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py:411\u001b[0m, in \u001b[0;36mStrategy.validation_step\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module:\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_redirection(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlightning_module, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalidation_step\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 411\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlightning_module\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidation_step\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/gsoc/nemo/NeMo-opensource/nemo/collections/asr/models/av_ctc_models.py:704\u001b[0m, in \u001b[0;36mAV_EncDecCTCModel.validation_step\u001b[0;34m(self, batch, batch_idx, dataloader_idx)\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvalidation_step\u001b[39m(\u001b[38;5;28mself\u001b[39m, batch, batch_idx, dataloader_idx\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[0;32m--> 704\u001b[0m metrics \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidation_pass\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataloader_idx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrainer\u001b[38;5;241m.\u001b[39mval_dataloaders) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtrainer\u001b[38;5;241m.\u001b[39mval_dataloaders) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalidation_step_outputs[dataloader_idx]\u001b[38;5;241m.\u001b[39mappend(metrics)\n", + "File \u001b[0;32m~/gsoc/nemo/NeMo-opensource/nemo/collections/asr/models/av_ctc_models.py:672\u001b[0m, in \u001b[0;36mAV_EncDecCTCModel.validation_pass\u001b[0;34m(self, batch, batch_idx, dataloader_idx)\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[38;5;66;03m# if isinstance(batch, DALIOutputs) and batch.has_processed_signal:\u001b[39;00m\n\u001b[1;32m 666\u001b[0m \u001b[38;5;66;03m# log_probs, encoded_len, predictions = self.forward(\u001b[39;00m\n\u001b[1;32m 667\u001b[0m \u001b[38;5;66;03m# processed_signal=signal, processed_signal_length=signal_len\u001b[39;00m\n\u001b[1;32m 668\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[1;32m 669\u001b[0m \u001b[38;5;66;03m# else:\u001b[39;00m\n\u001b[1;32m 670\u001b[0m log_probs, encoded_len, predictions \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mforward(audio_input_signal\u001b[38;5;241m=\u001b[39msignal, audio_input_signal_length\u001b[38;5;241m=\u001b[39msignal_len, video_input_signal\u001b[38;5;241m=\u001b[39mvideo_input_signal)\n\u001b[0;32m--> 672\u001b[0m loss_value \u001b[38;5;241m=\u001b[39m 
self.loss(log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len)",
    "    675 loss_value, metrics = self.add_interctc_losses(",
    "    676     loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix=\"val_\",",
    "    677 )",
    "    679 self.wer.update(",
    "    680     predictions=log_probs, targets=transcript, targets_lengths=transcript_len, predictions_lengths=encoded_len,",
    "    681 )",
    "File ~/.miniconda3/envs/nemo/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)",
    "-> 1532     return self._call_impl(*args, **kwargs)",
    "File ~/.miniconda3/envs/nemo/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)",
    "-> 1541     return forward_call(*args, **kwargs)",
    "File ~/gsoc/nemo/NeMo-opensource/nemo/core/classes/common.py:1064, in typecheck.__call__(self, wrapped, instance, args, kwargs)",
    "-> 1064     outputs = wrapped(*args, **kwargs)",
    "File ~/gsoc/nemo/NeMo-opensource/nemo/collections/asr/losses/ctc.py:77, in CTCLoss.forward(self, log_probs, targets, input_lengths, target_lengths)",
    "-> 77     loss = super().forward(log_probs=log_probs, targets=targets, input_lengths=input_lengths, target_lengths=target_lengths)",
    "File ~/.miniconda3/envs/nemo/lib/python3.10/site-packages/torch/nn/modules/loss.py:1785, in CTCLoss.forward(self, log_probs, targets, input_lengths, target_lengths)",
    "-> 1785     return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction, self.zero_infinity)",
    "File ~/.miniconda3/envs/nemo/lib/python3.10/site-packages/torch/nn/functional.py:2687, in ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity)",
    "-> 2687     return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction), zero_infinity)",
    "\u001b[0;31mRuntimeError\u001b[0m: blank must be in label range" ] } ], 
"source": [ "trainer.fit(model)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.summarize()" + ] } ], "metadata": { @@ -423,7 +695,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/balu_codes/train_av_asr.py b/balu_codes/train_av_asr.py new file mode 100644 index 000000000000..bff0e3acbd6a --- /dev/null +++ b/balu_codes/train_av_asr.py @@ -0,0 +1,135 @@ +import os +import sys +sys.path.insert(0, os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource/')) +import nemo.core as nemo_core +from nemo.core import adapter_mixins +from nemo.utils import exp_manager +import nemo.collections.asr as nemo_asr +import nemo +import json +from omegaconf import OmegaConf, open_dict +import torch +from pytorch_lightning import Trainer +from lightning.pytorch.loggers import WandbLogger +from torchmetrics.text import WordErrorRate +import warnings +import argparse + +# Function to load and configure the model +def load_and_configure_model(config_file_path): + conf = OmegaConf.load(config_file_path) + overrides = OmegaConf.from_cli() + updated_conf = OmegaConf.merge(conf, overrides) + OmegaConf.set_struct(updated_conf, True) + model = nemo_asr.models.AV_EncDecCTCModelBPE(updated_conf) + + model.setup_training_data(model.cfg.train_ds) + return model, conf + +# Function to freeze and unfreeze model parameters based on adapters +def manage_model_adapters(model, conf): + # Freeze the entire model + model.freeze() + + # Determine which modules to train based on configuration + if model.cfg.use_video_modality: + modules_to_train = [ + model.a_linear, model.v_linear, model.av_encoder, model.av_enocder_layer, + model.a_modal_embs, model.v_modal_embs, model.decoder, model.a_pos_enc, model.v_pos_enc + ] + elif not model.cfg.use_video_modality and model.cfg.use_pretrained_dec: + modules_to_train = [model.a_model.decoder] + else: # not model.cfg.use_video_modality and not model.cfg.use_pretrained_dec + modules_to_train = [model.decoder] + + # Set the selected modules to training mode and enable gradients + for module in modules_to_train: + module.train() + for param in module.parameters(): + param.requires_grad = True + + # Handle adapter configurations if needed + if conf.adapters.linear_adapter.keep: + model.a_model.freeze() + model.a_model.set_enabled_adapters(enabled=False) + model.a_model.set_enabled_adapters(name=conf.adapters.linear_adapter.name, enabled=True) + model.a_model.unfreeze_enabled_adapters() + else: + model.a_model.unfreeze() + +# Function to set up the trainer +def setup_trainer(): + torch.set_float32_matmul_precision('high') + accelerator = 'gpu' if torch.cuda.is_available() else 'cpu' + trainer = Trainer( + devices=-1, accelerator=accelerator, strategy="ddp_find_unused_parameters_true", + max_epochs=100, + enable_checkpointing=False, logger=False, + log_every_n_steps=5, check_val_every_n_epoch=1, + ) + return trainer + +# Function to set up experiment manager +def setup_exp_manager(trainer, model): + os.environ.pop('NEMO_EXPM_VERSION', None) + + exp_config = exp_manager.ExpManagerConfig( + exp_dir=model.cfg.exp_dir, + name=f'{model.cfg.wandb.run_name}', + checkpoint_callback_params=exp_manager.CallbackParams( + monitor="val_u_wer", + mode="min", + always_save_nemo=True, + save_best_model=True, + ), + create_wandb_logger=model.cfg.wandb.create_wandb_logger, + wandb_logger_kwargs=OmegaConf.create({"project": 
f"{model.cfg.wandb.project}", "name": f"{model.cfg.wandb.run_name}_{model.cfg.train_ds.override_snr_ratio}", "log_model": model.cfg.wandb.log_model}), + ) + + exp_config = OmegaConf.structured(exp_config) + logdir = exp_manager.exp_manager(trainer, exp_config) + if model.cfg.wandb.create_wandb_logger: + trainer.loggers[1].log_hyperparams(OmegaConf.to_container(model.cfg)) # wandb logger + # log the manifest file to wandb server + trainer.loggers[1].experiment.log_artifact(f"{model.cfg.train_ds.manifest_filepath}") + trainer.loggers[1].experiment.log_artifact(f"{model.cfg.validation_ds.manifest_filepath}") + + return logdir + +# Main function to execute the workflow +def main(config_file_path, args): + model, conf = load_and_configure_model(config_file_path) + if args.resume_pretrained: + ckpt_path = f"/tmp/bld56_dataset_v1/saved_models/pre_av_ndec_uman_ntok--val_u_wer=0.0809-epoch=11.ckpt" + checkpoint = torch.load(ckpt_path) + model.load_state_dict(checkpoint['state_dict']) + print(model) + model.cfg.wandb.run_name += 'pre+' + manage_model_adapters(model, conf) + + trainer = setup_trainer() + model.set_trainer(trainer) + logdir = setup_exp_manager(trainer, model) + trainer.fit(model) + # trainer.validate(model) + +if __name__ == "__main__": + # add config number args + parser = argparse.ArgumentParser(description='Train AV ASR model') + parser.add_argument('--config', type=int, default=5, help='Config number to use for training') + parser.add_argument('--snr', type=float, default=0.7, help='SNR ratio to use for training') + parser.add_argument('--gpus', type=int, default=1, help='Number of GPUs to use for training') + parser.add_argument('--resume_pretrained', type=bool, default=False, help='Resume training from pretrained model') + args = parser.parse_args() + config_file_path = f"/home/bld56/gsoc/nemo/NeMo-opensource/balu_codes/configs/c{args.config}.yaml" + # load yaml file + with open(config_file_path) as file: + config = OmegaConf.load(file) + config['train_ds']['override_snr_ratio'] = args.snr + config['validation_ds']['override_snr_ratio'] = args.snr + config['test_ds']['override_snr_ratio'] = args.snr + with open(config_file_path, 'w') as file: + OmegaConf.save(config, file) + warnings.filterwarnings("ignore", category=UserWarning, message="PySoundFile failed. 
Trying audioread instead.") + warnings.filterwarnings("ignore", category=FutureWarning, message="librosa.core.audio.__audioread_load\n\tDeprecated as of librosa version 0.10.0.\n\tIt will be removed in librosa version 1.0.") + main(config_file_path, args) diff --git a/balu_codes/train_av_asr.sh b/balu_codes/train_av_asr.sh new file mode 100644 index 000000000000..6860ae5616c9 --- /dev/null +++ b/balu_codes/train_av_asr.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -J "nemo_train" +#SBATCH -p gpu +#SBATCH -c 46 +#SBATCH -G 2 +#SBATCH --mem-per-cpu=3G +#SBATCH -o "/home/bld56/gsoc/nemo/NeMo-opensource/balu_codes/nemo_train/%j.log" +#SBATCH -w "gput067" +#SBATCH --time="2-00:00:00" +#SBATCH --mail-type=ALL +#SBATCH --mail-user="lakshmipathi.balaji@research.iiit.ac.in" + +# bash /home/bld56/gsoc/general/set_up_node.sh +# export PATH="/home/bld56/.miniconda3/bin:$PATH" +# export PATH="$HOME/tools:$PATH" + + +cd /home/bld56/gsoc/nemo/NeMo-opensource/balu_codes +# gput064 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 1 --snr 0.6 & +# sleep 10 +# CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 2 --snr 0.6 & +# sleep 10 +# CUDA_VISIBLE_DEVICES=2 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 5 --snr 0.6 & + +# gput067 +# sleep 10 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 6 --snr 0.7 & +# sleep 10 +# CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 1 --snr 0.6 & + + +# gput068 +# sleep 10 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 2 --snr 0.6 & +# sleep 100 +# CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 5 --snr 0.6 & + +# gput066 +# sleep 100 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 6 --snr 0.6 & + +# gput065 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 1 --snr 0.5 & +# sleep 100 +# CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 5 --snr 0.5 & + +# gput063 +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 6 --snr 0.5 & + +# PRETRAINING, gput063 +# bash /home/bld56/gsoc/general/set_up_node.sh + +# PRETRAINED USING +source activate /home/bld56/.miniconda3/envs/nemo +# bash /home/bld56/gsoc/general/set_up_node.sh + +# av_ndec_lman_ntok +# CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 5 --snr 0.5 --resume_pretrained True & + +sleep 20 +# av_ndec_uman_ntok +# CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 9 --snr 0.5 --resume_pretrained True & + +# au_ndec_lman_ntok +CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 10 --snr 0.5 --resume_pretrained True & + +sleep 20 +# au_ndec_uman_ntok +CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 11 --snr 0.5 --resume_pretrained True & + + +wait \ No newline at end of file diff --git a/balu_codes/transcribe.py b/balu_codes/transcribe.py index 73ad418ac2d7..0a8a1d1f6285 100644 --- a/balu_codes/transcribe.py +++ b/balu_codes/transcribe.py @@ -1,12 +1,12 @@ # import nemo.collections.asr as nemo_asr import sys import os 
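+# NOTE (clarifying comment): sys.path.append places the checkout *last* on the import
+# path, so this only takes effect when no other `nemo` package is installed earlier
+# on sys.path in the active environment.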
-sys.path.append(os.path.abspath('/workspace/nemo/NeMo-opensource')) +sys.path.append(os.path.abspath('/home/bld56/gsoc/nemo/NeMo-opensource')) import nemo.collections.asr as nemo_asr def load_model(model_name): model = nemo_asr.models.ASRModel.from_pretrained(model_name) return model -# model = load_model("stt_en_conformer_ctc_large") -model = load_model("QuartzNet15x5Base-En") +model = load_model("stt_en_conformer_ctc_large") +# model = load_model("QuartzNet15x5Base-En") model.transcribe(["/disk1/it1/mixed_audios/009LTXtP4vE_c053b1_171114BCPC_SLASH_171114-BC-PC_DOT_mp3_00035.wav"]) \ No newline at end of file diff --git a/nemo/collections/asr/data/audio_to_text_dataset.py b/nemo/collections/asr/data/audio_to_text_dataset.py index 25334017d792..1eca0d771189 100644 --- a/nemo/collections/asr/data/audio_to_text_dataset.py +++ b/nemo/collections/asr/data/audio_to_text_dataset.py @@ -736,6 +736,8 @@ def get_av_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = No return_sample_id=config.get('return_sample_id', False), channel_selector=config.get('channel_selector', None), video_frame_rate=config.get('video_frame_rate', 3), + get_vid_feats=config.get('get_vid_feats', False), + get_zero_vid_feats = config.get('get_zero_vid_feats', False), ) return dataset @@ -743,7 +745,7 @@ def get_av_to_text_char_dataset_from_config( config, local_rank: int, global_rank: int, world_size: int, preprocessor_cfg: Optional[DictConfig] = None ): """ - Construct Audio-To-Text Char dataset from a config. + Construct AV-To-Text Char dataset from a config. Args: config: dataset config local_rank: model local rank @@ -903,6 +905,72 @@ def get_audio_to_text_bpe_dataset_from_config( dataset = get_bpe_dataset(config=config, tokenizer=tokenizer, augmentor=augmentor) return dataset +def get_av_bpe_dataset( + config: dict, tokenizer: 'TokenizerSpec', augmentor: Optional['AudioAugmentor'] = None +) -> av_to_text.AVToBPEDataset: + """ + Instantiates a Byte Pair Encoding / Word Piece Encoding based AudioToBPEDataset. + + Args: + config: Config of the AVToBPEDataset. + tokenizer: An instance of a TokenizerSpec object. + augmentor: Optional AudioAugmentor object for augmentations on audio data. + + Returns: + An instance of AVToBPEDataset. + """ + dataset = av_to_text.AVToBPEDataset( + manifest_filepath=config['manifest_filepath'], + tokenizer=tokenizer, + sample_rate=config['sample_rate'], + int_values=config.get('int_values', False), + augmentor=augmentor, + max_duration=config.get('max_duration', None), + min_duration=config.get('min_duration', None), + max_utts=config.get('max_utts', 0), + trim=config.get('trim_silence', False), + use_start_end_token=config.get('use_start_end_token', True), + return_sample_id=config.get('return_sample_id', False), + channel_selector=config.get('channel_selector', None), + video_frame_rate=config.get('video_frame_rate', 3), + get_vid_feats=config.get('get_vid_feats', False), + get_zero_vid_feats = config.get('get_zero_vid_feats', False), + override_snr_ratio = config.get('override_snr_ratio', None), + ) + return dataset + +def get_av_to_text_bpe_dataset_from_config( + config, + local_rank: int, + global_rank: int, + world_size: int, + tokenizer, + preprocessor_cfg: Optional[DictConfig] = None, +): + """ + Construct AV-To-Text BPE dataset from a config. 
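+
+    A minimal illustrative config (keys consumed by ``get_av_bpe_dataset`` above; the
+    values shown are placeholders, not recommendations)::
+
+        {'manifest_filepath': 'train_manifest.json', 'sample_rate': 16000,
+         'video_frame_rate': 3, 'get_vid_feats': True, 'override_snr_ratio': None}
+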
+ Args: + config: BPE dataset config + local_rank: model local rank + global_rank: model global rand + world_size: world size + tokenizer: BPE tokenizer + preprocessor_cfg: preprocessor config, for DALI BPE dataset + + Returns: + constructed dataset or None if dataset config is invalid or nothing to load + """ + if 'augmentor' in config: + augmentor = process_augmentations(config['augmentor'], global_rank=global_rank, world_size=world_size) + else: + augmentor = None + + if 'manifest_filepath' in config and config['manifest_filepath'] is None: + logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") + return None + dataset = get_av_bpe_dataset(config=config, tokenizer=tokenizer, augmentor=augmentor) + return dataset + class ASRPredictionWriter(BasePredictionWriter): def __init__(self, dataset, output_file: str): diff --git a/nemo/collections/asr/data/av_to_text.py b/nemo/collections/asr/data/av_to_text.py index 8a1e33d865b6..55ee5034d81d 100644 --- a/nemo/collections/asr/data/av_to_text.py +++ b/nemo/collections/asr/data/av_to_text.py @@ -47,6 +47,9 @@ import numpy as np +# FOR NOISE LOADING +from pydub import AudioSegment + __all__ = [ 'AVToCharDataset', 'AVToBPEDataset', @@ -56,7 +59,7 @@ ['wav', 'mp3', 'flac', 'opus'] + [fmt.lower() for fmt in valid_sf_formats.keys()]) -def _speech_collate_fn(batch, pad_id): +def _speech_collate_fn(batch, pad_id, get_vid_feats): """collate batch of audio sig, audio len, video sig, tokens, tokens len Args: batch (Optional[FloatTensor], Optional[LongTensor], Optional[LongTensor], @@ -65,13 +68,22 @@ def _speech_collate_fn(batch, pad_id): assumes the signals are 1d torch tensors (i.e. mono audio). """ packed_batch = list(zip(*batch)) - if len(packed_batch) == 6: - _, audio_lengths, _, _, tokens_lengths, sample_ids = packed_batch - elif len(packed_batch) == 5: - sample_ids = None - _, audio_lengths, _, _, tokens_lengths = packed_batch + if get_vid_feats: + if len(packed_batch) == 6: + _, audio_lengths, _, _, tokens_lengths, sample_ids = packed_batch + elif len(packed_batch) == 5: + sample_ids = None + _, audio_lengths, _, _, tokens_lengths = packed_batch + else: + raise ValueError(f"Expects 5 or 6 tensors in the batch!") else: - raise ValueError("Expects 5 or 6 tensors in the batch!") + if len(packed_batch) == 4: + sample_ids = None + _, audio_lengths, _, tokens_lengths = packed_batch + elif len(packed_batch) == 5: + _, audio_lengths, _, tokens_lengths, sample_ids = packed_batch + else: + raise ValueError(f"Expects 4 or 5 tensors in the batch!") max_audio_len = 0 has_audio = audio_lengths[0] is not None if has_audio: @@ -80,17 +92,22 @@ def _speech_collate_fn(batch, pad_id): audio_signal, tokens, video_feat_signal = [], [], [] for b in batch: - if len(b) == 6: + if len(b) == 6 and get_vid_feats: sig, sig_len, video_feat, tokens_i, tokens_i_len, _ = b - else: + elif len(b) == 5 and get_vid_feats: sig, sig_len, video_feat, tokens_i, tokens_i_len = b + elif len(b) == 5 and not get_vid_feats: + sig, sig_len, tokens_i, tokens_i_len, _ = b + elif len(b) == 4 and not get_vid_feats: + sig, sig_len, tokens_i, tokens_i_len = b if has_audio: sig_len = sig_len.item() if sig_len < max_audio_len: pad = (0, max_audio_len - sig_len) sig = torch.nn.functional.pad(sig, pad) audio_signal.append(sig) - video_feat_signal.append(video_feat) + if get_vid_feats: + video_feat_signal.append(video_feat) tokens_i_len = tokens_i_len.item() if tokens_i_len < max_tokens_len: pad = (0, max_tokens_len - tokens_i_len) @@ -102,14 +119,20 @@ def 
_speech_collate_fn(batch, pad_id): audio_lengths = torch.stack(audio_lengths) else: audio_signal, audio_lengths = None, None - video_feat_signal = torch.stack(video_feat_signal) + if get_vid_feats: + video_feat_signal = torch.stack(video_feat_signal) tokens = torch.stack(tokens) tokens_lengths = torch.stack(tokens_lengths) - if sample_ids is None: - return audio_signal, audio_lengths, video_feat_signal, tokens, tokens_lengths - else: + base_output = [audio_signal, audio_lengths, tokens, tokens_lengths] + + if get_vid_feats: + base_output.insert(2, video_feat_signal) + + if sample_ids is not None: sample_ids = torch.tensor(sample_ids, dtype=torch.int32) - return audio_signal, audio_lengths, video_feat_signal, tokens, tokens_lengths, sample_ids + base_output.append(sample_ids) + + return tuple(base_output) class ASR_AV_ManifestProcessor: """ @@ -346,7 +369,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: return { 'audio_signal': [NeuralType(('B', 'T'), AudioSignal())], 'a_sig_length': [NeuralType(tuple('B'), LengthsType())], - 'video_input_signal': [NeuralType(('B', 'T', 'D'), ChannelType())], + 'video_input_signal': [NeuralType(('B', 'T', 'D'), ChannelType(), optional=True)], 'transcripts': [NeuralType(('B', 'T'), LabelsType())], 'transcript_length': [NeuralType(tuple('B'), LengthsType())], 'sample_id': [NeuralType(tuple('B'), LengthsType(), optional=True)], @@ -369,6 +392,9 @@ def __init__( return_sample_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, video_frame_rate: int = 5, + get_vid_feats: bool = True, + get_zero_vid_feats: bool = False, + override_snr_ratio: Optional[float] = None, ): if type(manifest_filepath) == str: manifest_filepath = manifest_filepath.split(",") @@ -394,6 +420,9 @@ def __init__( self.return_sample_id = return_sample_id self.channel_selector = channel_selector self.video_frame_rate = video_frame_rate + self.get_vid_feats = get_vid_feats + self.get_zero_vid_feats = get_zero_vid_feats + self.override_snr_ratio = override_snr_ratio def get_manifest_sample(self, sample_id): return self.manifest_processor.collection[sample_id] @@ -404,6 +433,39 @@ def __getitem__(self, index): else: return self._process_sample(index) + def calculate_rms(self, audio): + """Calculate the RMS (root mean square) level of an audio signal.""" + return torch.sqrt(torch.mean(audio ** 2)) + + def adjust_volume(self, audio, target_rms): + """Adjust the audio's volume to a target RMS level.""" + current_rms = self.calculate_rms(audio) + return audio * (target_rms / (current_rms + 1e-9)) # Avoid division by zero + + def _mix_audios(self, noisy_audio_feats, clean_audio_feats, snr, target_sr=16000): + if self.override_snr_ratio is not None: + snr = self.override_snr_ratio + rms1 = self.calculate_rms(clean_audio_feats) + rms2 = self.calculate_rms(noisy_audio_feats) + mean_rms = (rms1 + rms2) / 2 + + noisy_audio_feats = self.adjust_volume(noisy_audio_feats, mean_rms) + clean_audio_feats = self.adjust_volume(clean_audio_feats, mean_rms) + + assert len(clean_audio_feats) >= 10*target_sr, f"Audio length is too short: {len(clean_audio_feats)}" + + if len(noisy_audio_feats) < len(clean_audio_feats): + noisy_audio_feats = torch.nn.functional.pad(noisy_audio_feats, (0, len(clean_audio_feats) - len(noisy_audio_feats))) + + min_len = min(10*target_sr, len(clean_audio_feats)) + noisy_audio_feats = noisy_audio_feats[:min_len] + clean_audio_feats = clean_audio_feats[:min_len] + + mixed_audio = snr * clean_audio_feats + (1 - snr) * noisy_audio_feats + + return mixed_audio 
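+        # Note on the convention above: `snr` acts as a linear mixing weight in [0, 1],
+        # not a decibel SNR. Both signals are first RMS-normalized to their mean RMS
+        # level and truncated to a 10-second window, then combined as
+        #     mixed = snr * clean + (1 - snr) * noise
+        # so snr=1.0 keeps only the clean speech and snr=0.0 keeps only the noise.
+        # A dB-based formulation would instead scale the noise by 10 ** (-snr_db / 20)
+        # relative to the clean RMS.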
+ + def _process_sample(self, index): sample = self.manifest_processor.collection[index] offset = sample.offset @@ -411,7 +473,7 @@ def _process_sample(self, index): if offset is None: offset = 0 - features = self.featurizer.process( + clean_audio_features = self.featurizer.process( sample.audio_file, offset=offset, duration=sample.duration, @@ -419,35 +481,56 @@ def _process_sample(self, index): orig_sr=sample.orig_sr, channel_selector=self.channel_selector, ) - f, fl = features, torch.tensor(features.shape[0]).long() + if self.override_snr_ratio != 0.0: + audio = AudioSegment.from_file(sample.video_file, format="mp4") + samples_pydub = np.array(audio.get_array_of_samples(), dtype=np.float32) + noise_features = torch.tensor(samples_pydub, dtype=torch.float32) + noise_features = noise_features / (2**(8 * audio.sample_width) / 2) + mixed_features = self._mix_audios(noise_features, clean_audio_features, snr = sample.snr) + else: + mixed_features = clean_audio_features + f, fl = mixed_features, torch.tensor(mixed_features.shape[0]).long() + + # TODO: @Balu, saving audio temporarily + # save_audio_path = f"/tmp/bld56_dataset_v1/audioset/temp_sample_check/{index}.wav" + # import torchaudio + # torchaudio.save(save_audio_path, f.unsqueeze(0), 16000) + + if self.get_vid_feats: + if not self.get_zero_vid_feats: + # check if file exists + assert os.path.exists( + sample.video_featfile), f"Video feature file {sample.video_featfile} does not exist" + vf = np.load(sample.video_featfile) + # uniformly sample self.video_frame_rate frames from video at shape 0. + assert vf.shape[0] == self.video_frame_rate*sample.duration, f"Video feature file {sample.video_featfile} has {vf.shape[0]} frame_feats, expected {self.video_frame_rate}" + vf = torch.from_numpy(vf) + # make it torch float + vf = vf.float() + else: + vf = torch.zeros( + self.video_frame_rate*sample.duration, 768).float() + + t, tl = self.manifest_processor.process_text_by_sample(sample=sample) - # check if file exists - assert os.path.exists( - sample.video_featfile), f"Video feature file {sample.video_featfile} does not exist" - vf = np.load(sample.video_featfile) - # uniformly sample self.video_frame_rate frames from video at shape 0. - # TODO: @Balu, how would you do this, if you one frame rate then you should make many dirs with different frame rates. 
- assert vf.shape[0] == self.video_frame_rate*sample.duration, f"Video feature file {sample.video_featfile} has {vf.shape[0]} frame_feats, expected {self.video_frame_rate}" + output = [f, fl, torch.tensor(t).long(), torch.tensor(tl).long()] - t, tl = self.manifest_processor.process_text_by_sample(sample=sample) + if self.get_vid_feats: + output.insert(2, vf) - vf = torch.from_numpy(vf) - # make it torch float - vf = vf.float() if self.return_sample_id: - output = f, fl, vf, torch.tensor( - t).long(), torch.tensor(tl).long(), index - else: - output = f, fl, vf, torch.tensor( - t).long(), torch.tensor(tl).long() + output.append(index) + + output = tuple(output) return output def __len__(self): + # return 100 return len(self.manifest_processor.collection) def _collate_fn(self, batch): - return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id) + return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id, get_vid_feats=self.get_vid_feats) class AVToCharDataset(_AVTextDataset): @@ -491,7 +574,7 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: return { 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'hidden_states': NeuralType(('B', 'T', 'D'), EncodedRepresentation()), + 'hidden_states': NeuralType(('B', 'T', 'D'), ImageFeatureValue(), optional=True), 'transcripts': NeuralType(('B', 'T'), LabelsType()), 'transcript_length': NeuralType(tuple('B'), LengthsType()), 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), @@ -518,6 +601,9 @@ def __init__( return_sample_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, video_frame_rate: int = 3, + get_vid_feats: bool = True, + get_zero_vid_feats: bool = False, + override_snr_ratio: Optional[float] = None, ): self.labels = labels @@ -541,10 +627,13 @@ def __init__( return_sample_id=return_sample_id, channel_selector=channel_selector, video_frame_rate=video_frame_rate, + get_vid_feats=get_vid_feats, + get_zero_vid_feats=get_zero_vid_feats, + override_snr_ratio=override_snr_ratio, ) -class AudioToBPEDataset(_AVTextDataset): +class AVToBPEDataset(_AVTextDataset): """ Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each new line is a @@ -579,20 +668,31 @@ class AudioToBPEDataset(_AVTextDataset): tokens to beginning and ending of speech respectively. return_sample_id (bool): whether to return the sample_id as a part of each sample channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing. + video_frame_rate (int): Frame rate of video, used to calculate duration of video """ @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """Returns definitions of module output ports. 
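+        When ``get_vid_feats`` is False, the video feature port (``hidden_states``) is
+        omitted from the returned dictionary, as in the conditional below.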
""" - return { - 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), - 'a_sig_length': NeuralType(tuple('B'), LengthsType()), - 'video_input_signal': NeuralType(('B', 'T', 'D'), ChannelType()), - 'transcripts': NeuralType(('B', 'T'), LabelsType()), - 'transcript_length': NeuralType(tuple('B'), LengthsType()), - 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), - } + if self.get_vid_feats: + return { + 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), + 'a_sig_length': NeuralType(tuple('B'), LengthsType()), + 'hidden_states': NeuralType(('B', 'T', 'D'), ImageFeatureValue(), optional=True), + 'transcripts': NeuralType(('B', 'T'), LabelsType()), + 'transcript_length': NeuralType(tuple('B'), LengthsType()), + 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + } + else: + return { + 'audio_signal': NeuralType(('B', 'T'), AudioSignal()), + 'a_sig_length': NeuralType(tuple('B'), LengthsType()), + 'transcripts': NeuralType(('B', 'T'), LabelsType()), + 'transcript_length': NeuralType(tuple('B'), LengthsType()), + 'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True), + } + def __init__( self, @@ -608,6 +708,10 @@ def __init__( use_start_end_token: bool = True, return_sample_id: bool = False, channel_selector: Optional[ChannelSelectorType] = None, + video_frame_rate: int = 3, + get_vid_feats: bool = True, + get_zero_vid_feats: bool = False, + override_snr_ratio: Optional[float] = None, ): if use_start_end_token and hasattr(tokenizer, "bos_id") and tokenizer.bos_id > 0: bos_id = tokenizer.bos_id @@ -658,4 +762,8 @@ def __call__(self, *args): trim=trim, return_sample_id=return_sample_id, channel_selector=channel_selector, + video_frame_rate=video_frame_rate, + get_vid_feats=get_vid_feats, + get_zero_vid_feats=get_zero_vid_feats, + override_snr_ratio=override_snr_ratio, ) diff --git a/nemo/collections/asr/metrics/av_wer.py b/nemo/collections/asr/metrics/av_wer.py new file mode 100644 index 000000000000..26ed7a092f64 --- /dev/null +++ b/nemo/collections/asr/metrics/av_wer.py @@ -0,0 +1,263 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import editdistance +import jiwer +import torch +from torchmetrics import Metric + +from nemo.collections.asr.parts.submodules.ctc_decoding import AbstractCTCDecoding +from nemo.collections.asr.parts.submodules.multitask_decoding import AbstractMultiTaskDecoding +from nemo.collections.asr.parts.submodules.rnnt_decoding import AbstractRNNTDecoding +from nemo.utils import logging + +import regex as re + +__all__ = ['AV_WER'] + +def move_dimension_to_the_front(tensor, dim_index): + all_dims = list(range(tensor.ndim)) + return tensor.permute(*([dim_index] + all_dims[:dim_index] + all_dims[dim_index + 1 :])) + + +class AV_WER(Metric): + """ + This metric computes numerator and denominator for Overall Word Error Rate (WER) between prediction and reference + texts. 
When doing distributed training/evaluation the result of ``res=WER(predictions, predictions_lengths, targets, targets_lengths)``
+    calls will be all-reduced between all workers using SUM operations. Here ``res`` contains three numbers
+    ``res=[wer, total_levenshtein_distance, total_number_of_words]``.
+
+    This metric can also compute WER with tags, WER without tags, and the accuracy of tag prediction.
+    TODO @Balu: Can also integrate spans of the predicted tag.
+
+    If used with a PyTorch Lightning LightningModule, include wer_numerator and wer_denominator inside
+    validation_step results. Then aggregate (sum) them at the end of the validation epoch to correctly
+    compute the validation WER.
+
+    Example:
+        def validation_step(self, batch, batch_idx):
+            ...
+            wer_num, wer_denom = self.__wer(predictions, predictions_len, transcript, transcript_len)
+            self.val_outputs = {'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom}
+            return self.val_outputs
+
+        def on_validation_epoch_end(self):
+            ...
+            wer_num = torch.stack([x['val_wer_num'] for x in self.val_outputs]).sum()
+            wer_denom = torch.stack([x['val_wer_denom'] for x in self.val_outputs]).sum()
+            tensorboard_logs = {'validation_loss': val_loss_mean, 'validation_avg_wer': wer_num / wer_denom}
+            self.val_outputs.clear()  # free memory
+            return {'val_loss': val_loss_mean, 'log': tensorboard_logs}
+
+    Args:
+        decoding: An instance of CTCDecoding, RNNTDecoding, or MultiTaskDecoding.
+        use_cer: Whether to use Character Error Rate instead of Word Error Rate.
+        log_prediction: Whether to log a single decoded sample per call.
+        fold_consecutive: Whether to fold repeated CTC predictions before decoding (CTC only).
+        batch_dim_index: Index corresponding to batch dimension. (For RNNT.)
+        dist_sync_on_step: Whether to perform reduction on the forward pass of the metric.
+        labelled_manifest: Whether the manifest has labels or not.
+
+    Returns:
+        ``compute()`` returns a tuple of five values:
+        ``(labelled_wer, unlabelled_wer, label_accuracy, scores_unlabelled, words_unlabelled)``, where
+        ``labelled_wer`` is measured on the raw (tag-bearing) text, ``unlabelled_wer`` on the text with
+        tags stripped, and ``labelled_wer`` is ``None`` when ``labelled_manifest`` is False. 
+ """ + + full_state_update: bool = True + + def __init__( + self, + decoding: Union[AbstractCTCDecoding, AbstractRNNTDecoding, AbstractMultiTaskDecoding], + use_cer=False, + log_prediction=True, + fold_consecutive=True, + batch_dim_index=0, + dist_sync_on_step=False, + labelled_manifest=False, + ): + super().__init__(dist_sync_on_step=dist_sync_on_step) + + self.decoding = decoding + self.use_cer = use_cer + self.log_prediction = log_prediction + self.fold_consecutive = fold_consecutive + self.batch_dim_index = batch_dim_index + + self.has_spl_tokens = False + self.decode = None + if isinstance(self.decoding, AbstractRNNTDecoding): + self.decode = lambda predictions, predictions_lengths, predictions_mask, input_ids, targets: self.decoding.rnnt_decoder_predictions_tensor( + encoder_output=predictions, encoded_lengths=predictions_lengths + ) + elif isinstance(self.decoding, AbstractCTCDecoding): + self.decode = lambda predictions, predictions_lengths, predictions_mask, input_ids, targets: self.decoding.ctc_decoder_predictions_tensor( + decoder_outputs=predictions, + decoder_lengths=predictions_lengths, + fold_consecutive=self.fold_consecutive, + ) + elif isinstance(self.decoding, AbstractMultiTaskDecoding): + self.has_spl_tokens = True + self.decode = lambda predictions, prediction_lengths, predictions_mask, input_ids, targets: self.decoding.decode_predictions_tensor( + encoder_hidden_states=predictions, + encoder_input_mask=predictions_mask, + decoder_input_ids=input_ids, + return_hypotheses=False, + ) + else: + raise TypeError(f"WER metric does not support decoding of type {type(self.decoding)}") + + self.add_state("scores_labelled", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.add_state("words_labelled", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.add_state("scores_unlabelled", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.add_state("words_unlabelled", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.add_state("correct_label_count", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + self.add_state("num_samples", default=torch.tensor(0), dist_reduce_fx='sum', persistent=False) + + self.labelled_manifest = labelled_manifest + + def get_words_and_scores(self, hypotheses: List[str], references: List[str], labelled_data: str): + words = 0 + scores = 0 + + for h, r in zip(hypotheses, references): + if self.use_cer: + h_list = list(h) + r_list = list(r) + else: + h_list = h.split() + r_list = r.split() + words += len(r_list) + # Compute Levenstein's distance + scores += editdistance.eval(h_list, r_list) + + if labelled_data: + self.scores_labelled = torch.tensor(scores, device=self.scores_labelled.device, dtype=self.scores_labelled.dtype) + self.words_labelled = torch.tensor(words, device=self.words_labelled.device, dtype=self.words_labelled.dtype) + else: + self.scores_unlabelled = torch.tensor(scores, device=self.scores_unlabelled.device, dtype=self.scores_unlabelled.dtype) + self.words_unlabelled = torch.tensor(words, device=self.words_unlabelled.device, dtype=self.words_unlabelled.dtype) + + def seperate_labels_from_labelled_data(self, hypotheses: List[str], references: List[str]): + # labels are in the text of form ...text..., note it is not only and but can be any number of tag marked by <> + unlabelled_hypotheses = [] + unlabelled_references = [] + labels_hypotheses = [] + labels_references = [] + correct_label_count = 0 + + for h, r in zip(hypotheses, references): + # 
identify the tags with <> + # h_tags = [h[i:j+1] for i in range(len(h)) for j in range(i, len(h)) if h[i] == '<' and h[j] == '>'] + # r_tags = [r[i:j+1] for i in range(len(r)) for j in range(i, len(r)) if r[i] == '<' and r[j] == '>'] + r_tags = re.findall(r'', r) + h_tags = re.findall(r'', h) + # assert len(r_tags) == 2, f"Reference tags are not 2, they are {r_tags} for {r}" # Note we are only considering for single label. + # Above assert doesnt apply when ps audio is of 15 seconds but words are in only first 4 seconds and noise occurs from 8 to 10 secs. + # Replace all tags in the hypothesis and reference + unlabelled_h = h + unlabelled_r = r + for tag in r_tags: + unlabelled_h = unlabelled_h.replace(tag, '') + unlabelled_r = unlabelled_r.replace(tag, '') + + unlabelled_hypotheses.append(unlabelled_h) + unlabelled_references.append(unlabelled_r) + labels_hypotheses.append(h_tags) + # FOR IT1 + # if len(r_tags) == 2: + # labels_references.append(r_tags[0]) + # else: + # labels_references.append([]) + # if len(h_tags) == 2 and len(r_tags) == 2 and h_tags[0] == r_tags[0]: + # correct_label_count += 1 + + # FOR IT2 + if len(r_tags) == 1: + labels_references.append(r_tags[0]) + else: + labels_references.append([]) + if len(h_tags) == 1 and len(r_tags) == 1 and h_tags[0] == r_tags[0]: + correct_label_count += 1 + + return unlabelled_hypotheses, unlabelled_references, labels_hypotheses, labels_references, correct_label_count + + + def update( + self, + predictions: torch.Tensor, + predictions_lengths: torch.Tensor, + targets: torch.Tensor, + targets_lengths: torch.Tensor, + predictions_mask: Optional[torch.Tensor] = None, + input_ids: Optional[torch.Tensor] = None, + ): + """ + Updates metric state. + Args: + predictions: an integer torch.Tensor of shape ``[Batch, Time, {Vocabulary}]`` (if ``batch_dim_index == 0``) or + ``[Time, Batch]`` (if ``batch_dim_index == 1``) + prediction_lengths: an integer torch.Tensor of shape ``[Batch]`` + targets: an integer torch.Tensor of shape ``[Batch, Time]`` (if ``batch_dim_index == 0``) or + ``[Time, Batch]`` (if ``batch_dim_index == 1``) + target_lengths: an integer torch. 
+ predictions_lengths: an integer torch.Tensor of shape ``[Batch]`` + """ + references = [] + with torch.no_grad(): + tgt_lenths_cpu_tensor = targets_lengths.long().cpu() + targets_cpu_tensor = targets.long().cpu() + # check batch_dim_index is first dim + if self.batch_dim_index != 0: + targets_cpu_tensor = move_dimension_to_the_front(targets_cpu_tensor, self.batch_dim_index) + # iterate over batch + for ind in range(targets_cpu_tensor.shape[0]): + tgt_len = tgt_lenths_cpu_tensor[ind].item() + target = targets_cpu_tensor[ind][:tgt_len].numpy().tolist() + reference = self.decoding.decode_tokens_to_str(target) + references.append(reference) + hypotheses, _ = self.decode(predictions, predictions_lengths, predictions_mask, input_ids, targets) + + if self.has_spl_tokens: + hypotheses = [self.decoding.strip_special_tokens(hyp) for hyp in hypotheses] + references = [self.decoding.strip_special_tokens(ref) for ref in references] + + if self.log_prediction: + logging.info(f"\n") + logging.info(f"reference:{references[0]}") + logging.info(f"predicted:{hypotheses[0]}") + logging.info(f"\n") + + unlabelled_hypotheses, unlabelled_references, labels_hypotheses, labels_references, correct_label_count = self.seperate_labels_from_labelled_data(hypotheses, references) + self.get_words_and_scores(unlabelled_hypotheses, unlabelled_references, labelled_data=False) + self.get_words_and_scores(hypotheses, references, labelled_data=True) + self.correct_label_count = torch.tensor(correct_label_count, device=self.correct_label_count.device, dtype=self.correct_label_count.dtype) + self.num_samples = torch.tensor(len(references), device=self.num_samples.device, dtype=self.num_samples.dtype) + + + def compute(self): + if self.labelled_manifest: + scores_labelled = self.scores_labelled.detach().float() + words_labelled = self.words_labelled.detach().float() + labelled_wer = scores_labelled / words_labelled + else: + scores_labelled = None + words_labelled = None + labelled_wer = None + scores_unlabelled = self.scores_unlabelled.detach().float() + words_unlabelled = self.words_unlabelled.detach().float() + unlabelled_wer = scores_unlabelled / words_unlabelled + correct_label_count = self.correct_label_count.detach().float() + num_samples = self.num_samples.detach().float() + + return labelled_wer, unlabelled_wer, correct_label_count/num_samples, scores_unlabelled, words_unlabelled diff --git a/nemo/collections/asr/models/__init__.py b/nemo/collections/asr/models/__init__.py index 538b3fcabcf6..4a8772de2c05 100644 --- a/nemo/collections/asr/models/__init__.py +++ b/nemo/collections/asr/models/__init__.py @@ -24,6 +24,7 @@ from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.models.ctc_models import EncDecCTCModel from nemo.collections.asr.models.av_ctc_models import AV_EncDecCTCModel +from nemo.collections.asr.models.av_ctc_bpe_models import AV_EncDecCTCModelBPE from nemo.collections.asr.models.enhancement_models import ( EncMaskDecAudioToAudioModel, PredictiveAudioToAudioModel, diff --git a/nemo/collections/asr/models/av_ctc_bpe_models.py b/nemo/collections/asr/models/av_ctc_bpe_models.py index 7544ce50e7fe..474beb0796a2 100644 --- a/nemo/collections/asr/models/av_ctc_bpe_models.py +++ b/nemo/collections/asr/models/av_ctc_bpe_models.py @@ -22,8 +22,10 @@ from nemo.collections.asr.data import audio_to_text_dataset from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer 
import WER +from nemo.collections.asr.metrics.av_wer import AV_WER from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models.av_ctc_models import AV_EncDecCTCModel +from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler @@ -31,10 +33,10 @@ from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils -__all__ = ['EncDecCTCModelBPE'] +__all__ = ['AV_EncDecCTCModelBPE'] -class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin): +class AV_EncDecCTCModelBPE(AV_EncDecCTCModel, ASRBPEMixin): """Encoder decoder CTC-based models with Byte Pair Encoding.""" def __init__(self, cfg: DictConfig, trainer=None): @@ -50,7 +52,7 @@ def __init__(self, cfg: DictConfig, trainer=None): # Initialize a dummy vocabulary vocabulary = self.tokenizer.tokenizer.get_vocab() - + self.labelled_manifest = cfg.labelled_manifest # Set the new vocabulary with open_dict(cfg): # sidestepping the potential overlapping tokens issue in aggregate tokenizers @@ -72,6 +74,7 @@ def __init__(self, cfg: DictConfig, trainer=None): super().__init__(cfg=cfg, trainer=trainer) + # Setup decoding objects decoding_cfg = self.cfg.get('decoding', None) @@ -84,23 +87,17 @@ def __init__(self, cfg: DictConfig, trainer=None): self.decoding = CTCBPEDecoding(self.cfg.decoding, tokenizer=self.tokenizer) # Setup metric with decoding strategy - self.wer = WER( + self.wer = AV_WER( decoding=self.decoding, use_cer=self._cfg.get('use_cer', False), dist_sync_on_step=True, log_prediction=self._cfg.get("log_prediction", False), + labelled_manifest=self.labelled_manifest, ) def _setup_dataloader_from_config(self, config: Optional[Dict]): - # if config.get("use_lhotse"): - # return get_lhotse_dataloader_from_config( - # config, - # global_rank=self.global_rank, - # world_size=self.world_size, - # dataset=LhotseSpeechToTextBpeDataset(tokenizer=self.tokenizer), - # ) - - dataset = audio_to_text_dataset.get_audio_to_text_bpe_dataset_from_config( + + dataset = audio_to_text_dataset.get_av_to_text_bpe_dataset_from_config( config=config, local_rank=self.local_rank, global_rank=self.global_rank, @@ -112,9 +109,6 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): if dataset is None: return None - if isinstance(dataset, AudioToBPEDALIDataset): - # DALI Dataset implements dataloader interface - return dataset shuffle = config['shuffle'] if isinstance(dataset, torch.utils.data.IterableDataset): @@ -288,11 +282,12 @@ def change_vocabulary( self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer) - self.wer = WER( + self.wer = AV_WER( decoding=self.decoding, use_cer=self._cfg.get('use_cer', False), log_prediction=self._cfg.get("log_prediction", False), dist_sync_on_step=True, + labelled_manifest=self.labelled_manifest, ) # Update config @@ -324,11 +319,12 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig): self.decoding = CTCBPEDecoding(decoding_cfg=decoding_cfg, tokenizer=self.tokenizer,) - self.wer = WER( + self.wer = AV_WER( decoding=self.decoding, use_cer=self.wer.use_cer, log_prediction=self.wer.log_prediction, dist_sync_on_step=True, + labelled_manifest=self.labelled_manifest, ) self.decoder.temperature = decoding_cfg.get('temperature', 1.0) @@ -349,308 +345,4 @@ def 
list_available_models(cls) -> List[PretrainedModelInfo]: """ results = [] - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_256", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256/versions/1.0.0rc1/files/stt_en_citrinet_256.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512/versions/1.0.0rc1/files/stt_en_citrinet_512.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_256_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_512_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_512/versions/1.0.0/files/stt_es_citrinet_512.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_citrinet_1024", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_citrinet_1024", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_citrinet_1024/versions/1.5.0/files/stt_de_citrinet_1024.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_no_hyphen_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_citrinet_1024_gamma_0_25/versions/1.5/files/stt_fr_no_hyphen_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_citrinet_1024_gamma_0_25/versions/1.8.0/files/stt_es_citrinet_1024_gamma_0_25.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_small", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small/versions/1.6.0/files/stt_en_conformer_ctc_small.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium/versions/1.6.0/files/stt_en_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.10.0/files/stt_en_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_xlarge/versions/1.10.0/files/stt_en_conformer_ctc_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_xsmall_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_xsmall_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_xsmall_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_xsmall_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_small_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - 
pretrained_model_name="stt_en_squeezeformer_ctc_small_medium_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_small_medium_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_small_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_small_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_medium_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_medium_large_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_medium_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_medium_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_medium_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_squeezeformer_ctc_large_ls", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_squeezeformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_squeezeformer_ctc_large_ls/versions/1.13.0/files/stt_en_squeezeformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_small_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_small_ls/versions/1.0.0/files/stt_en_conformer_ctc_small_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_medium_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_medium_ls/versions/1.0.0/files/stt_en_conformer_ctc_medium_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_conformer_ctc_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large_ls/versions/1.0.0/files/stt_en_conformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_conformer_ctc_large", - description="For details about this model, please visit https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_no_hyphen_conformer_ctc_large", - description="For details about this model, please visit 
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_fr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_conformer_ctc_large/versions/1.5.1/files/stt_fr_no_hyphen_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_conformer_ctc_large/versions/1.5.0/files/stt_de_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_conformer_ctc_large/versions/1.8.0/files/stt_es_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hi_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hi_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hi_conformer_ctc_medium/versions/1.6.0/files/stt_hi_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_mr_conformer_ctc_medium", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_mr_conformer_ctc_medium", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_mr_conformer_ctc_medium/versions/1.6.0/files/stt_mr_conformer_ctc_medium.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large/versions/1.0.0/files/stt_enes_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ca_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_conformer_ctc_large/versions/1.11.0/files/stt_ca_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_rw_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_rw_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_rw_conformer_ctc_large/versions/1.11.0/files/stt_rw_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_enes_conformer_ctc_large_codesw", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_enes_conformer_ctc_large_codesw", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_enes_conformer_ctc_large_codesw/versions/1.0.0/files/stt_enes_conformer_ctc_large_codesw.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - 
pretrained_model_name="stt_be_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_be_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_be_conformer_ctc_large/versions/1.12.0/files/stt_be_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_hr_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_hr_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_hr_conformer_ctc_large/versions/1.11.0/files/stt_hr_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_conformer_ctc_large/versions/1.13.0/files/stt_it_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_conformer_ctc_large/versions/1.13.0/files/stt_ru_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_eo_conformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_eo_conformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_eo_conformer_ctc_large/versions/1.14.0/files/stt_eo_conformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_large", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large/versions/1.0.0/files/stt_en_fastconformer_ctc_large.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_large_ls", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large_ls", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_large_ls/versions/1.0.0/files/stt_en_fastconformer_ctc_large_ls.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_xlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xlarge/versions/1.20.0/files/stt_en_fastconformer_ctc_xlarge.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_fastconformer_ctc_xxlarge", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xxlarge", - 
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xxlarge/versions/1.20.1/files/stt_en_fastconformer_ctc_xxlarge.nemo", - ) - results.append(model) - return results diff --git a/nemo/collections/asr/models/av_ctc_models.py b/nemo/collections/asr/models/av_ctc_models.py index 76d054ab0142..6edaa497b4b7 100644 --- a/nemo/collections/asr/models/av_ctc_models.py +++ b/nemo/collections/asr/models/av_ctc_models.py @@ -29,8 +29,9 @@ # from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs # from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.models.ctc_models import EncDecCTCModel +from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE from nemo.collections.asr.losses.ctc import CTCLoss -from nemo.collections.asr.metrics.wer import WER +from nemo.collections.asr.metrics.av_wer import AV_WER from nemo.collections.asr.models.asr_model import ASRModel, ExportableEncDecModel from nemo.collections.asr.parts.mixins import ASRModuleMixin, ASRTranscriptionMixin, InterCTCMixin, TranscribeConfig from nemo.collections.asr.parts.mixins.transcription import GenericTranscriptionType, TranscriptionReturnType @@ -44,6 +45,11 @@ from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, LogprobsType, NeuralType, SpectrogramType, ImageFeatureValue from nemo.utils import logging +#ADAPTERS +from nemo.core import adapter_mixins +from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig +from nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module import MultiHeadAttentionAdapterConfig +from nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module import RelPositionMultiHeadAttentionAdapterConfig __all__ = ['AV_EncDecCTCModel'] @@ -58,8 +64,28 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): self.world_size = trainer.world_size super().__init__(cfg=cfg, trainer=trainer) - - self.a_model = EncDecCTCModel.from_pretrained(cfg.a_model_name) + if "BPE:" in cfg.a_model_name: + a_model_cfg = EncDecCTCModelBPE.from_pretrained(cfg.a_model_name[4:], return_config=True) + a_model_cfg = self.update_model_config_to_support_adapter(a_model_cfg) # for adapters + self.a_model = EncDecCTCModelBPE.from_pretrained(cfg.a_model_name[4:], override_config_path=a_model_cfg) + else: + a_model_cfg = EncDecCTCModel.from_pretrained(cfg.a_model_name, return_config=True) + a_model_cfg = self.update_model_config_to_support_adapter(a_model_cfg) + self.a_model = EncDecCTCModel.from_pretrained(cfg.a_model_name, override_config_path=a_model_cfg) + + self.labelled_manifest = cfg.labelled_manifest + + + if cfg.adapters.linear_adapter.keep: + linear_adapter_cfg = LinearAdapterConfig( + in_features=self.a_model.encoder.d_model, + dim = cfg.adapters.linear_adapter.dim, + activation=cfg.adapters.linear_adapter.activation, + norm_position=cfg.adapters.linear_adapter.norm_position, + dropout=cfg.adapters.linear_adapter.dropout, + ) + linear_adapter_name = cfg.adapters.linear_adapter.name + self.a_model.add_adapter(name=linear_adapter_name, cfg=linear_adapter_cfg) with open_dict(self._cfg): if "feat_in" not in self._cfg.decoder or ( not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out') @@ -75,24 +101,23 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): ) ) cfg.decoder["num_classes"] = len(self.cfg.decoder.vocabulary) + assert not (self.cfg.use_pretrained_dec and 
         # initialize a transformer encoder and decoder
-        self.a_linear = torch.nn.Linear(in_features = self.a_model.encoder._feat_out, out_features = self.cfg.av_encoder.d_model)
-        self.v_linear = torch.nn.Linear(in_features = self.cfg.v_model.feat_dim, out_features = self.cfg.av_encoder.d_model)
-        self.av_enocder_layer = torch.nn.TransformerEncoderLayer(d_model = self.cfg.av_encoder.d_model, nhead = self.cfg.av_encoder.nhead, dropout = self.cfg.av_encoder.dropout, batch_first=True)
-        self.av_encoder = torch.nn.TransformerEncoder(self.av_enocder_layer, num_layers = self.cfg.av_encoder.num_layers)
+        if cfg.use_video_modality:
+            self.a_linear = torch.nn.Linear(in_features=self.a_model.encoder._feat_out, out_features=self.cfg.av_encoder.d_model)
+            self.v_linear = torch.nn.Linear(in_features=self.cfg.v_model.feat_dim, out_features=self.cfg.av_encoder.d_model)
+            self.av_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=self.cfg.av_encoder.d_model, nhead=self.cfg.av_encoder.nhead, dropout=self.cfg.av_encoder.dropout, batch_first=True)
+            self.av_encoder = torch.nn.TransformerEncoder(self.av_encoder_layer, num_layers=self.cfg.av_encoder.num_layers)

-        # Modality embeddings
-        self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)
-        self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)
+            # Modality embeddings
+            self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)
+            self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)

-        # Trainable positional encodings
-        # self.a_pos_enc = torch.nn.Embedding(10000, self.cfg.decoder.feat_in)
-        # self.v_pos_enc = torch.nn.Embedding(10000, self.cfg.decoder.feat_in)
-
-        # self.av_decoder_layer = torch.nn.TransformerDecoderLayer(d_model = self.cfg.av_decoder.d_model, nhead = self.cfg.av_decoder.nhead, dropout = self.cfg.av_decoder.dropout, batch_first=True)
-        # self.av_decoder = torch.nn.TransformerDecoder(self.av_decoder_layer, num_layers = self.cfg.av_decoder.num_layers)
-        # self.av_linear = torch.nn.Linear(in_features = self.cfg.av_decoder.d_model, out_features = len(self.a_model.decoder.vocabulary))
+            # Trainable positional encodings
+            self.a_pos_enc = torch.nn.Embedding(10000, self.cfg.av_encoder.d_model)
+            self.v_pos_enc = torch.nn.Embedding(10000, self.cfg.av_encoder.d_model)

         self.decoder = EncDecCTCModel.from_config_dict(self._cfg.decoder)

         self.loss = CTCLoss(
@@ -113,11 +138,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         self.decoding = CTCDecoding(self.cfg.decoding, vocabulary=OmegaConf.to_container(self.decoder.vocabulary))

         # Setup metric with decoding strategy
-        self.wer = WER(
+        self.wer = AV_WER(
             decoding=self.decoding,
             use_cer=self._cfg.get('use_cer', False),
             dist_sync_on_step=True,
             log_prediction=self._cfg.get("log_prediction", False),
+            labelled_manifest=self.labelled_manifest
         )

         # Setup optional Optimization flags
@@ -126,6 +152,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         # setting up interCTC loss (from InterCTCMixin)
         self.setup_interctc(decoder_name='decoder', loss_name='loss', wer_name='wer')

+    def update_model_config_to_support_adapter(self, model_cfg):
+        with open_dict(model_cfg):
+            adapter_metadata = adapter_mixins.get_registered_adapter(model_cfg.encoder._target_)
+            if adapter_metadata is not None:
+                model_cfg.encoder._target_ = adapter_metadata.adapter_class_path
+
+        logging.info(f"Updated encoder _target_ to: {model_cfg.encoder._target_}")
+        return model_cfg
+
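+    # Usage sketch (assumes NeMo's AdapterModuleMixin API): once the encoder
+    # _target_ points at its adapter-capable class, adapters added through
+    # add_adapter() can be toggled selectively, e.g.
+    #     self.a_model.set_enabled_adapters(enabled=False)
+    #     self.a_model.set_enabled_adapters(name=linear_adapter_name, enabled=True)
+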
     def transcribe(
         self,
         audio: Union[str, List[str], torch.Tensor, np.ndarray],
@@ -222,11 +257,12 @@ def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[Di
             decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary)
         )

-        self.wer = WER(
+        self.wer = AV_WER(
             decoding=self.decoding,
             use_cer=self._cfg.get('use_cer', False),
             dist_sync_on_step=True,
             log_prediction=self._cfg.get("log_prediction", False),
+            labelled_manifest=self.labelled_manifest
         )

         # Update config
@@ -266,11 +302,12 @@ def change_decoding_strategy(self, decoding_cfg: DictConfig):
             decoding_cfg=decoding_cfg, vocabulary=OmegaConf.to_container(self.decoder.vocabulary)
         )

-        self.wer = WER(
+        self.wer = AV_WER(
             decoding=self.decoding,
             use_cer=self.wer.use_cer,
             log_prediction=self.wer.log_prediction,
             dist_sync_on_step=True,
+            labelled_manifest=self.labelled_manifest
         )

         self.decoder.temperature = decoding_cfg.get('temperature', 1.0)
@@ -490,37 +527,55 @@ def forward(
             processed_signal = self.a_model.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)

         encoder_output = self.a_model.encoder(audio_signal=processed_signal, length=processed_signal_length)
-        # B,C,T -> B,T,C
-        encoded = encoder_output[0].permute(0, 2, 1)
+        encoded = encoder_output[0]
         encoded_len = encoder_output[1]
-        a_encoded = self.a_linear(encoded)
-        v_encoded = self.v_linear(video_input_signal)
-
-        # Add modality embeddings
-        B, T, C = a_encoded.size()
-        B, F, D = v_encoded.size()
-        assert C == D, "The audio and video features must have the same dimensionality"
-
-        # Expand modality embeddings to match the dimensions of a_encoded and v_encoded
-        a_modal_emb_expanded = self.a_modal_embs.weight.expand(B, T, -1)  # Shape: (B, T, feat_in)
-        v_modal_emb_expanded = self.v_modal_embs.weight.expand(B, F, -1)  # Shape: (B, F, feat_in)
+        if self.cfg.use_video_modality and not self.cfg.use_pretrained_dec:
+            # B,C,T -> B,T,C
+            encoded = encoded.permute(0, 2, 1)
+            a_encoded = self.a_linear(encoded)
+            v_encoded = self.v_linear(video_input_signal)

-        a_encoded = a_encoded + a_modal_emb_expanded
-        v_encoded = v_encoded + v_modal_emb_expanded
+            # Add modality embeddings
+            B, T, C = a_encoded.size()
+            B, F, D = v_encoded.size()
+            assert C == D, "The audio and video features must have the same dimensionality"
+
+            # Expand modality embeddings to match the dimensions of a_encoded and v_encoded
+            a_modal_emb_expanded = self.a_modal_embs.weight.expand(B, T, -1)  # Shape: (B, T, feat_in)
+            v_modal_emb_expanded = self.v_modal_embs.weight.expand(B, F, -1)  # Shape: (B, F, feat_in)
+
+            a_encoded = a_encoded + a_modal_emb_expanded
+            v_encoded = v_encoded + v_modal_emb_expanded
+
+            # Add positional encodings
+            a_pos_enc = self.a_pos_enc(torch.arange(T, device=a_encoded.device)).unsqueeze(0).expand(B, -1, -1)
+            v_pos_enc = self.v_pos_enc(torch.arange(F, device=v_encoded.device)).unsqueeze(0).expand(B, -1, -1)
+
+            a_encoded = a_encoded + a_pos_enc
+            v_encoded = v_encoded + v_pos_enc
+
+            # Concat and pass them through the transformer encoder
+            av_encoded = torch.cat((a_encoded, v_encoded), dim=1)
+            av_encoded = self.av_encoder(av_encoded)
+
+            # remove the v_encoded tokens
+            av_encoded = av_encoded[:, :T, :]
+
+            # B,T,C -> B,C,T
+            av_encoded = av_encoded.permute(0, 2, 1)
+
+            log_probs = self.decoder(encoder_output=av_encoded)
+            greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+        elif (not self.cfg.use_video_modality) and (not self.cfg.use_pretrained_dec):
+            log_probs = self.decoder(encoder_output=encoded)
+            greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+        elif (not self.cfg.use_video_modality) and self.cfg.use_pretrained_dec:
+            log_probs = self.a_model.decoder(encoder_output=encoded)
+            greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+        elif self.cfg.use_video_modality and self.cfg.use_pretrained_dec:
+            raise ValueError("Pretrained decoder is not supported for video modality")

-        # Add positional encodings
-        # a_encoded = a_encoded + self.a_pos_enc.to(a_encoded.device).repeat(B, T, 1)
-        # v_encoded = v_encoded + self.v_pos_enc.to(v_encoded.device).repeat(B, F, 1)
-
-        # Concat and pass them through the transformer encoder
-        av_encoded = torch.cat((a_encoded, v_encoded), dim=1)
-        av_encoded = self.av_encoder(av_encoded)
-
-        # B,T,C -> B,C,T
-        av_encoded = av_encoded.permute(0, 2, 1)
-        log_probs = self.decoder(encoder_output=av_encoded)
-        greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
-
         return (
             log_probs,
             encoded_len,
@@ -579,9 +634,22 @@ def training_step(self, batch, batch_nb):
                 targets_lengths=transcript_len,
                 predictions_lengths=encoded_len,
             )
-            wer, _, _ = self.wer.compute()
+            labelled_wer, unlabelled_wer, acc, scores_unlabelled, words_unlabelled = self.wer.compute()
             self.wer.reset()
-            tensorboard_logs.update({'training_batch_wer': wer})
+            if labelled_wer is not None:
+                tensorboard_logs.update({'train_l_wer': labelled_wer})
+                self.log('train_l_wer', labelled_wer, on_step=True, on_epoch=False)
+            if unlabelled_wer is not None:
+                tensorboard_logs.update({'train_u_wer': unlabelled_wer})
+                self.log('train_u_wer', unlabelled_wer, on_step=True, on_epoch=False)
+            if acc is not None:
+                tensorboard_logs.update({'train_acc': acc})
+                self.log('train_acc', acc, on_step=True, on_epoch=False)

         return {'loss': loss_value, 'log': tensorboard_logs}

@@ -623,11 +691,20 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0):
         self.wer.update(
             predictions=log_probs, targets=transcript, targets_lengths=transcript_len, predictions_lengths=encoded_len,
         )
-        wer, wer_num, wer_denom = self.wer.compute()
+        labelled_wer, unlabelled_wer, acc, scores_unlabelled, words_unlabelled = self.wer.compute()
         self.wer.reset()
-        metrics.update({'val_loss': loss_value, 'val_wer_num': wer_num, 'val_wer_denom': wer_denom, 'val_wer': wer})
-
-        self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32))
+        metrics.update({'val_loss': loss_value, 'val_labelled_wer': labelled_wer, 'val_unlabelled_wer': unlabelled_wer, 'val_acc': acc, 'val_wer_num': scores_unlabelled, 'val_wer_denom': words_unlabelled})
+
+        if labelled_wer is not None:
+            self.log('val_l_wer', labelled_wer, on_epoch=True, sync_dist=True)
+        if unlabelled_wer is not None:
+            self.log('val_u_wer', unlabelled_wer, on_epoch=True, sync_dist=True)
+        if acc is not None:
+            self.log('val_acc', acc, on_epoch=True, sync_dist=True)
+        self.log('val_loss', loss_value, sync_dist=True)

         # Reset access registry
         if AccessMixin.is_access_enabled(self.model_guid):
@@ -775,105 +852,6 @@ def list_available_models(cls) -> List[PretrainedModelInfo]:
         """
         results = []

-        model = PretrainedModelInfo(
-            pretrained_model_name="QuartzNet15x5Base-En",
-            description="QuartzNet15x5 
model trained on six datasets: LibriSpeech, Mozilla Common Voice (validated clips from en_1488h_2019-12-10), WSJ, Fisher, Switchboard, and NSC Singapore English. It was trained with Apex/Amp optimization level O1 for 600 epochs. The model achieves a WER of 3.79% on LibriSpeech dev-clean, and a WER of 10.05% on dev-other. Please visit https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels for further details.", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_quartznet15x5/versions/1.0.0rc1/files/stt_en_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_en_jasper10x5dr", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_jasper10x5dr/versions/1.0.0rc1/files/stt_en_jasper10x5dr.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ca_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ca_quartznet15x5/versions/1.0.0rc1/files/stt_ca_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_it_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_it_quartznet15x5/versions/1.0.0rc1/files/stt_it_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_fr_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_fr_quartznet15x5/versions/1.0.0rc1/files/stt_fr_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_es_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_es_quartznet15x5/versions/1.0.0rc1/files/stt_es_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_de_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_de_quartznet15x5/versions/1.0.0rc1/files/stt_de_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_pl_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_pl_quartznet15x5/versions/1.0.0rc1/files/stt_pl_quartznet15x5.nemo", - ) - 
results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_ru_quartznet15x5", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_ru_quartznet15x5/versions/1.0.0rc1/files/stt_ru_quartznet15x5.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_512", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_512/versions/1.0.0rc1/files/stt_zh_citrinet_512.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo", - ) - - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="stt_zh_citrinet_1024_gamma_0_25", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_1024_gamma_0_25", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_zh_citrinet_1024_gamma_0_25.nemo", - ) - results.append(model) - - model = PretrainedModelInfo( - pretrained_model_name="asr_talknet_aligner", - description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:asr_talknet_aligner", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/asr_talknet_aligner/versions/1.0.0rc1/files/qn5x5_libri_tts_phonemes.nemo", - ) - results.append(model) - return results @property diff --git a/nemo/collections/asr/models/ctc_models.py b/nemo/collections/asr/models/ctc_models.py index 4df02b1177cd..3887a92da648 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -65,7 +65,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): not self._cfg.decoder.feat_in and hasattr(self.encoder, '_feat_out') ): self._cfg.decoder.feat_in = self.encoder._feat_out - if "feat_in" not in self._cfg.decoder or not self._cfg.decoder.feat_in: raise ValueError("param feat_in of the decoder's config is not set!") if self.cfg.decoder.num_classes < 1 and self.cfg.decoder.vocabulary is not None: diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py index f92a12cd6dde..fca016b477fe 100644 --- a/nemo/collections/common/parts/preprocessing/collections.py +++ b/nemo/collections/common/parts/preprocessing/collections.py @@ -209,15 +209,17 @@ class AVText(_Collection): AV_OUTPUT_TYPE = collections.namedtuple( typename='AVTextEntity', - field_names='id audio_file video_featfile duration text_tokens offset text_raw speaker orig_sr lang', + field_names='id audio_file video_file video_featfile duration text_tokens snr offset text_raw speaker orig_sr lang', ) def __init__( self, ids: List[int], audio_files: List[str], + video_files: List[str], video_featfiles: List[str], durations: List[float], + snr_ratios: List[float], texts: List[str], offsets: List[str], speakers: List[Optional[int]], @@ -236,9 
+238,11 @@ def __init__( Args: ids: List of examples positions. audio_files: List of audio files. + video_files: List of video files. video_featfiles: List of video feature files. durations: List of float durations. texts: List of raw text transcripts. + snr_ratios: List of signal-to-noise ratios. offsets: List of duration offsets or None. speakers: List of optional speakers ids. orig_sampling_rates: List of original sampling rates of audio files. @@ -256,8 +260,8 @@ def __init__( if index_by_file_id: self.mapping = {} - for id_, audio_file, video_featfile, duration, offset, text, speaker, orig_sr, token_labels, lang in zip( - ids, audio_files, video_featfiles, durations, offsets, texts, speakers, orig_sampling_rates, token_labels, langs + for id_, audio_file, video_file, video_featfile, duration, offset, text, snr_ratio, speaker, orig_sr, token_labels, lang in zip( + ids, audio_files, video_files, video_featfiles, durations, offsets, texts, snr_ratios, speakers, orig_sampling_rates, token_labels, langs ): # Duration filters. if min_duration is not None and duration < min_duration: @@ -295,8 +299,8 @@ def __init__( total_duration += duration - data.append(output_type(id_, audio_file, video_featfile, duration, - text_tokens, offset, text, speaker, orig_sr, lang)) + data.append(output_type(id_, audio_file, video_file, video_featfile, duration, + text_tokens, snr_ratio, offset, text, speaker, orig_sr, lang)) if index_by_file_id: file_id, _ = os.path.splitext(os.path.basename(audio_file)) if file_id not in self.mapping: @@ -480,11 +484,13 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): Args: manifests_files: Either single string file or list of such - manifests to yield items from. - *args: Args to pass to `AudioText` constructor. - **kwargs: Kwargs to pass to `AudioText` constructor. + *args: Args to pass to `AVText` constructor. + **kwargs: Kwargs to pass to `AVText` constructor. 
""" - ids, audio_files, durations, texts, offsets, video_featfiles = ( + ids, audio_files, video_files, durations, texts, offsets, video_featfiles, snr_ratios = ( + [], + [], [], [], [], @@ -493,19 +499,21 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): [], ) speakers, orig_srs, token_labels, langs = [], [], [], [] - for item in manifest.item_iter(manifests_files): + for item in manifest.av_item_iter(manifests_files): ids.append(item['id']) audio_files.append(item['audio_file']) video_featfiles.append(item['feature_file']) durations.append(item['duration']) texts.append(item['text']) + video_files.append(item['video_file']) + snr_ratios.append(item['snr_ratio']) offsets.append(item['offset']) speakers.append(item['speaker']) orig_srs.append(item['orig_sr']) token_labels.append(item['token_labels']) langs.append(item['lang']) super().__init__( - ids, audio_files, video_featfiles, durations, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs + ids, audio_files, video_files, video_featfiles, durations, snr_ratios, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs ) diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index 1d49bd7c7019..773bbb13e90b 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -188,6 +188,165 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]: return item +def av_item_iter( + manifests_files: Union[str, List[str]], parse_func: Callable[[str, Optional[str]], Dict[str, Any]] = None +) -> Iterator[Dict[str, Any]]: + """Iterate through json lines of provided manifests. + + NeMo ASR pipelines often assume certain manifest files structure. In + particular, each manifest file should consist of line-per-sample files with + each line being correct json dict. Each such json dict should have a field + for audio file string, a field for duration float and a field for text + string. Offset also could be additional field and is set to None by + default. + + Args: + manifests_files: Either single string file or list of such - + manifests to yield items from. + + parse_func: A callable function which accepts as input a single line + of a manifest and optionally the manifest file itself, + and parses it, returning a dictionary mapping from str -> Any. + + Yields: + Parsed key to value item dicts. + + Raises: + ValueError: If met invalid json line structure. 
+ """ + + if isinstance(manifests_files, str): + manifests_files = [manifests_files] + + if parse_func is None: + parse_func = __av_parse_item + + errors = defaultdict(list) + k = -1 + logging.debug('Manifest files: %s', str(manifests_files)) + for manifest_file in manifests_files: + logging.debug('Using manifest file: %s', str(manifest_file)) + cached_manifest_file = DataStoreObject(manifest_file).get() + logging.debug('Cached at: %s', str(cached_manifest_file)) + with open(expanduser(cached_manifest_file), 'r') as f: + for line in f: + line = line.strip() + if not line: + continue + k += 1 + try: + item = parse_func(line, manifest_file) + except json.JSONDecodeError: + errors[str(manifest_file)].append(line) + continue + item['id'] = k + + yield item + + if len(errors) > 0: + for filename, lines in errors.items(): + logging.error("=============================================") + logging.error(f"Failed to parse {len(lines)} lines from manifest file: {filename}") + for line in lines: + logging.error(f"-- Failed to parse line: `{line}`") + raise RuntimeError("Failed to parse some lines from manifest files. See logs for more details.") + + +def __av_parse_item(line: str, manifest_file: str) -> Dict[str, Any]: + item = json.loads(line) + + # Audio file + if 'audio_filename' in item: + item['audio_file'] = item.pop('audio_filename') + elif 'audio_filepath' in item: + item['audio_file'] = item.pop('audio_filepath') + + # Video File + if 'video_filename' in item: + item['video_file'] = item.pop('video_filename') + elif 'video_filepath' in item: + item['video_file'] = item.pop('video_filepath') + + if 'video_file' not in item and 'audio_file' not in item: + raise ValueError( + f"Manifest file {manifest_file} has invalid json line structure: {line} without proper audio/video file key." + ) + + # If the audio/video path is a relative path and does not exist, + # try to attach the parent directory of manifest to the audio path. + # Revert to the original path if the new path still doesn't exist. + # Assume that the audio path is like "wavs/xxxxxx.wav". + if 'audio_file' in item: + item['audio_file'] = get_full_path(audio_file=item['audio_file'], manifest_file=manifest_file) + if 'video_file' in item: + item['video_file'] = get_full_path(audio_file=item['video_file'], manifest_file=manifest_file) + + # Duration. + if 'duration' not in item: + raise ValueError( + f"Manifest file {manifest_file} has invalid json line structure: {line} without proper duration key." + ) + + # Text. 
+ if 'text' in item: + pass + elif 'text_filepath' in item: + with open(item.pop('text_filepath'), 'r') as f: + item['text'] = f.read().replace('\n', '') + elif 'normalized_text' in item: + item['text'] = item['normalized_text'] + else: + item['text'] = "" + + # Optional RTTM file + if 'rttm_file' in item: + pass + elif 'rttm_filename' in item: + item['rttm_file'] = item.pop('rttm_filename') + elif 'rttm_filepath' in item: + item['rttm_file'] = item.pop('rttm_filepath') + else: + item['rttm_file'] = None + if item['rttm_file'] is not None: + item['rttm_file'] = get_full_path(audio_file=item['rttm_file'], manifest_file=manifest_file) + + # Optional audio feature file + if 'feature_file' in item: + pass + elif 'feature_filename' in item: + item['feature_file'] = item.pop('feature_filename') + elif 'feature_filepath' in item: + item['feature_file'] = item.pop('feature_filepath') + else: + item['feature_file'] = None + if item['feature_file'] is not None: + item['feature_file'] = get_full_path(audio_file=item['feature_file'], manifest_file=manifest_file) + + # Optional snr_ratio + if 'snr' in item: + pass + elif 'snr_ratio' in item: + item['snr'] = item.pop('snr_ratio') + else: + item['snr'] = None + + item = dict( + audio_file=item.get('audio_file', None), + video_file=item.get('video_file', None), + duration=item['duration'], + text=item['text'], + rttm_file=item['rttm_file'], + feature_file=item['feature_file'], + offset=item.get('offset', None), + speaker=item.get('speaker', None), + orig_sr=item.get('orig_sample_rate', None), + token_labels=item.get('token_labels', None), + lang=item.get('lang', None), + snr_ratio=item.get('snr', None), + ) + return item + + def is_tarred_dataset(audio_file: str, manifest_file: Optional[str] = None) -> bool: if "/" in audio_file or manifest_file is None: # audio files in a tarred dataset don't have `/` in their paths From cc8da56d2d283c8dda96dddbbad32f85af9b3f09 Mon Sep 17 00:00:00 2001 From: kolubex Date: Tue, 3 Sep 2024 06:11:32 -0400 Subject: [PATCH 4/4] added_head_for_label_prediction --- balu_codes/configs/c1.yaml | 8 +- balu_codes/configs/c10.yaml | 3 + balu_codes/configs/c11.yaml | 11 +- balu_codes/configs/c5.yaml | 11 +- balu_codes/configs/c9.yaml | 3 + .../ctc_model_QuartzNet15x5Base copy.yaml | 28 +- balu_codes/ctc_model_QuartzNet15x5Base.yaml | 265 ------ .../model_config_from_transcribe_py copy.yaml | 259 ------ .../model_config_from_transcribe_py.yaml | 128 ++- balu_codes/train_av_asr.py | 30 +- balu_codes/train_av_asr.sh | 4 +- nemo/collections/asr/data/av_to_text.py | 42 +- .../asr/models/av_ctc_bpe_models.py | 1 + nemo/collections/asr/models/av_ctc_models.py | 64 +- .../common/parts/preprocessing/collections.py | 22 +- .../common/parts/preprocessing/manifest.py | 7 +- .../asr/ASR_CTC_Language_Finetuning.ipynb | 797 ++++++++++++++---- .../asr/asr_adapters/ASR_with_Adapters.ipynb | 178 ++-- 18 files changed, 957 insertions(+), 904 deletions(-) delete mode 100644 balu_codes/ctc_model_QuartzNet15x5Base.yaml delete mode 100644 balu_codes/model_config_from_transcribe_py copy.yaml diff --git a/balu_codes/configs/c1.yaml b/balu_codes/configs/c1.yaml index c90d9447139f..827ee6269274 100644 --- a/balu_codes/configs/c1.yaml +++ b/balu_codes/configs/c1.yaml @@ -8,7 +8,7 @@ exp_dir: /tmp/bld56_dataset_v1/tmp/ wandb: run_name: au_pdec_uman_stok project: NEMO_TEST - create_wandb_logger: true + create_wandb_logger: false log_model: false use_video_modality: false use_pretrained_dec: true @@ -30,7 +30,7 @@ train_ds: tarred_audio_filepaths: null shuffle_n: 
2048 bucketing_strategy: synced_randomized - override_snr_ratio: 0.7 + override_snr_ratio: 0.5 bucketing_batch_size: - 34 - 30 @@ -50,7 +50,7 @@ validation_ds: shuffle: false num_workers: 10 pin_memory: true - override_snr_ratio: 0.7 + override_snr_ratio: 0.5 use_start_end_token: false test_ds: manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json @@ -62,7 +62,7 @@ test_ds: shuffle: false num_workers: 10 pin_memory: true - override_snr_ratio: 0.7 + override_snr_ratio: 0.5 use_start_end_token: false tokenizer: dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/ diff --git a/balu_codes/configs/c10.yaml b/balu_codes/configs/c10.yaml index 2d43b50fbb79..38db1de93c87 100644 --- a/balu_codes/configs/c10.yaml +++ b/balu_codes/configs/c10.yaml @@ -12,6 +12,9 @@ wandb: log_model: false use_video_modality: true use_pretrained_dec: false +label_pred_head: + keep: false + num_classes: 44 train_ds: manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json video_frame_rate: 5 diff --git a/balu_codes/configs/c11.yaml b/balu_codes/configs/c11.yaml index 485ac0cc6fc0..c548ec98e27e 100644 --- a/balu_codes/configs/c11.yaml +++ b/balu_codes/configs/c11.yaml @@ -3,7 +3,7 @@ log_prediction: true ctc_reduction: mean_batch skip_nan_grad: false a_model_name: BPE:stt_en_conformer_ctc_large -labelled_manifest: true +labelled_manifest: false exp_dir: /tmp/bld56_dataset_v1/tmp/ wandb: run_name: au_ndec_lman_ntok @@ -12,8 +12,11 @@ wandb: log_model: false use_video_modality: true use_pretrained_dec: false +label_pred_head: + keep: true + num_classes: 44 train_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json video_frame_rate: 5 get_vid_feats: true get_zero_vid_feats: true @@ -41,7 +44,7 @@ train_ds: - 12 - 8 validation_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json video_frame_rate: 5 get_vid_feats: true get_zero_vid_feats: true @@ -53,7 +56,7 @@ validation_ds: override_snr_ratio: 0.5 use_start_end_token: false test_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_test.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_test_no_label.json video_frame_rate: 5 get_vid_feats: true get_zero_vid_feats: true diff --git a/balu_codes/configs/c5.yaml b/balu_codes/configs/c5.yaml index 41c4f585431c..ace661ec92dd 100644 --- a/balu_codes/configs/c5.yaml +++ b/balu_codes/configs/c5.yaml @@ -3,7 +3,7 @@ log_prediction: true ctc_reduction: mean_batch skip_nan_grad: false a_model_name: BPE:stt_en_conformer_ctc_large -labelled_manifest: true +labelled_manifest: false exp_dir: /tmp/bld56_dataset_v1/tmp/ wandb: run_name: av_ndec_lman_ntok @@ -12,8 +12,11 @@ wandb: log_model: false use_video_modality: true use_pretrained_dec: false +label_pred_head: + keep: true + num_classes: 44 train_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json video_frame_rate: 5 get_vid_feats: true get_zero_vid_feats: false @@ -41,7 +44,7 @@ train_ds: - 12 - 8 validation_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json video_frame_rate: 5 
get_vid_feats: true get_zero_vid_feats: false @@ -53,7 +56,7 @@ validation_ds: override_snr_ratio: 0.5 use_start_end_token: false test_ds: - manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval.json + manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_eval_no_label.json video_frame_rate: 5 get_vid_feats: true get_zero_vid_feats: false diff --git a/balu_codes/configs/c9.yaml b/balu_codes/configs/c9.yaml index 5de581c9429b..87f1a1697796 100644 --- a/balu_codes/configs/c9.yaml +++ b/balu_codes/configs/c9.yaml @@ -12,6 +12,9 @@ wandb: log_model: false use_video_modality: true use_pretrained_dec: false +label_pred_head: + keep: false + num_classes: 44 train_ds: manifest_filepath: /tmp/bld56_dataset_v1/it2/annotations/manifest_train_no_label.json video_frame_rate: 5 diff --git a/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml b/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml index 4c40015a224d..364e6a337e23 100644 --- a/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml +++ b/balu_codes/ctc_model_QuartzNet15x5Base copy.yaml @@ -18,7 +18,7 @@ spec_augment: rect_masks: 5 rect_time: 120 -a_model_name: QuartzNet15x5Base-En +a_model_name: QuartzNet15x5Base-En # CHANGE sample_rate: 16000 labels: - ' ' @@ -50,20 +50,20 @@ labels: - z - '''' train_ds: - manifest_filepath: /disk1/it1/annotations/manifest_train.json - video_frame_rate: 5 - # - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json #TBD + manifest_filepath: /disk1/it1/annotations/manifest_train.json # CHANGE + video_frame_rate: 5 # CHANGE + # - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json #TBD # CHANGE sample_rate: 16000 - batch_size: 1 + batch_size: 1 # CHANGE shuffle: true - num_workers: 0 + num_workers: 0 # CHANGE pin_memory: true use_start_end_token: false trim_silence: false - max_duration: 20.0 + max_duration: 20.0 # CHANGE min_duration: 0.1 is_tarred: false - tarred_audio_filepaths: null + tarred_audio_filepaths: null # CHANGE shuffle_n: 2048 bucketing_strategy: synced_randomized bucketing_batch_size: @@ -77,9 +77,9 @@ train_ds: - 8 validation_ds: - manifest_filepath: /disk1/it1/annotations/manifest_train.json - video_frame_rate: 5 - # - /manifests/librispeech/librivox-dev-other.json #TBD + manifest_filepath: /disk1/it1/annotations/manifest_train.json # CHANGE + video_frame_rate: 5 # CHANGE + # - /manifests/librispeech/librivox-dev-other.json #TBD # CHANGE sample_rate: 16000 batch_size: 1 shuffle: false @@ -96,13 +96,13 @@ test_ds: pin_memory: true use_start_end_token: false -av_encoder: +av_encoder: # CHANGE d_model: 512 nhead: 4 num_layers: 2 dropout: 0.1 -v_model: +v_model: # CHANGE feat_dim: 512 decoder: @@ -146,4 +146,4 @@ optim: - 0.8 - 0.5 weight_decay: 0.001 -target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModelBPE +target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModel diff --git a/balu_codes/ctc_model_QuartzNet15x5Base.yaml b/balu_codes/ctc_model_QuartzNet15x5Base.yaml deleted file mode 100644 index d48014222f13..000000000000 --- a/balu_codes/ctc_model_QuartzNet15x5Base.yaml +++ /dev/null @@ -1,265 +0,0 @@ -preprocessor: - cls: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - params: - normalize: per_feature - window_size: 0.02 - sample_rate: 16000 - window_stride: 0.01 - window: hann - features: 64 - n_fft: 512 - frame_splicing: 1 - dither: 1.0e-05 - stft_conv: false -spec_augment: - cls: nemo.collections.asr.modules.SpectrogramAugmentation - params: - rect_freq: 50 - rect_masks: 5 - rect_time: 120 
-encoder: - cls: nemo.collections.asr.modules.ConvASREncoder - params: - feat_in: 64 - activation: relu - conv_mask: true - jasper: - - filters: 256 - repeat: 1 - kernel: - - 33 - stride: - - 2 - dilation: - - 1 - dropout: 0.0 - residual: false - separable: true - - filters: 256 - repeat: 5 - kernel: - - 33 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 256 - repeat: 5 - kernel: - - 33 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 256 - repeat: 5 - kernel: - - 33 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 256 - repeat: 5 - kernel: - - 39 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 256 - repeat: 5 - kernel: - - 39 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 256 - repeat: 5 - kernel: - - 39 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 51 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 51 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 51 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 63 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 63 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 63 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 75 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 75 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 5 - kernel: - - 75 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: true - separable: true - - filters: 512 - repeat: 1 - kernel: - - 87 - stride: - - 1 - dilation: - - 2 - dropout: 0.0 - residual: false - separable: true - - filters: 1024 - repeat: 1 - kernel: - - 1 - stride: - - 1 - dilation: - - 1 - dropout: 0.0 - residual: false -decoder: - cls: nemo.collections.asr.modules.ConvASRDecoder - params: - feat_in: 1024 - num_classes: 28 - vocabulary: - - ' ' - - a - - b - - c - - d - - e - - f - - g - - h - - i - - j - - k - - l - - m - - n - - o - - p - - q - - r - - s - - t - - u - - v - - w - - x - - y - - z - - '''' -optim: - name: novograd - lr: 0.01 - betas: - - 0.8 - - 0.5 - weight_decay: 0.001 -target: nemo.collections.asr.models.ctc_models.EncDecCTCModel diff --git a/balu_codes/model_config_from_transcribe_py copy.yaml b/balu_codes/model_config_from_transcribe_py copy.yaml deleted file mode 100644 index 1a511e019423..000000000000 --- a/balu_codes/model_config_from_transcribe_py copy.yaml +++ /dev/null @@ -1,259 +0,0 @@ -sample_rate: 16000 -log_prediction: true -ctc_reduction: mean_batch -skip_nan_grad: false -a_model_name: #TBD -train_ds: - manifest_filepath: - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json #TBD - sample_rate: 16000 - batch_size: 32 - shuffle: true - num_workers: 4 - pin_memory: true - use_start_end_token: false - trim_silence: false - max_duration: 10.0 - 
min_duration: 0.1 - is_tarred: false - tarred_audio_filepaths: null - shuffle_n: 2048 - bucketing_strategy: synced_randomized - bucketing_batch_size: - - 34 - - 30 - - 26 - - 22 - - 18 - - 16 - - 12 - - 8 -validation_ds: - manifest_filepath: - - /manifests/librispeech/librivox-dev-other.json #TBD - sample_rate: 16000 - batch_size: 32 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false -test_ds: - manifest_filepath: - - /manifests/librispeech/librivox-dev-other.json #TBD - sample_rate: 16000 - batch_size: 32 - shuffle: false - num_workers: 8 - pin_memory: true - use_start_end_token: false -tokenizer: - dir: /tokenizers/NeMo_ASR_SET/English/asr_set_3.0/tokenizer_spe_unigram_v128 - type: bpe - model_path: nemo:e06949b0b85a485e9f280ea6d19e5492_tokenizer.model - vocab_path: nemo:53bbc634b62446de83525753e95a50ac_vocab.txt - spe_tokenizer_vocab: nemo:ff63e3c43c5f4b95bff702425366a4a6_tokenizer.vocab -preprocessor: - _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor - sample_rate: 16000 - normalize: per_feature - window_size: 0.025 - window_stride: 0.01 - window: hann - features: 80 - n_fft: 512 - log: true - frame_splicing: 1 - dither: 1.0e-05 - pad_to: 0 - pad_value: 0.0 -spec_augment: - _target_: nemo.collections.asr.modules.SpectrogramAugmentation - freq_masks: 2 - time_masks: 10 - freq_width: 27 - time_width: 0.05 -# encoder: -# _target_: nemo.collections.asr.modules.ConformerEncoder -# feat_in: 80 -# feat_out: -1 -# n_layers: 18 -# d_model: 512 -# subsampling: striding -# subsampling_factor: 4 -# subsampling_conv_channels: 512 -# ff_expansion_factor: 4 -# self_attention_model: rel_pos -# n_heads: 8 -# att_context_size: -# - -1 -# - -1 -# xscaling: true -# untie_biases: true -# pos_emb_max_len: 5000 -# conv_kernel_size: 31 -# conv_norm_type: batch_norm -# dropout: 0.1 -# dropout_emb: 0.0 -# dropout_att: 0.1 - -av_enocder: - d_model: 512 - nhead: 4 - num_layers: 2 - dropout: 0.1 - -adapters: - #TBD - -decoder: # Keep it thse same as you are going by same decoder, note it has dimension. 
- _target_: nemo.collections.asr.modules.ConvASRDecoder - feat_in: 512 - num_classes: 128 - vocabulary: - - - - ▁ - - s - - t - - e - - d - - o - - ▁the - - a - - i - - ▁a - - u - - 'y' - - m - - l - - 'n' - - p - - re - - c - - h - - r - - ▁s - - g - - ▁to - - er - - ing - - f - - ▁and - - an - - ▁i - - k - - ▁that - - '''' - - ▁of - - ▁in - - w - - ▁p - - ed - - or - - al - - ar - - ▁f - - en - - in - - b - - ▁you - - ▁w - - ▁b - - le - - ll - - es - - ▁it - - ve - - ur - - ▁we - - ▁re - - ▁be - - ly - - ▁is - - ▁he - - ▁o - - ▁c - - it - - ▁n - - ▁on - - un - - ▁t - - 'on' - - se - - th - - ce - - ▁do - - ic - - ▁for - - ▁th - - ion - - ch - - ▁was - - ri - - ent - - ▁g - - ver - - ▁co - - li - - ▁ha - - ▁ma - - la - - ro - - v - - us - - ▁ca - - ▁di - - ▁this - - ra - - ▁st - - ▁e - - ▁not - - ▁so - - ▁de - - ▁have - - ter - - ir - - ▁go - - ation - - ▁with - - ate - - ▁me - - ▁mo - - ment - - ▁con - - ▁but - - vi - - ▁pro - - ▁ho - - j - - ▁com - - ight - - ▁know - - ▁what - - ect - - ▁ex - - ▁some - - ▁would - - ▁like - - x - - ▁his - - q - - z -optim: - name: adamw - lr: 2.0 - betas: - - 0.9 - - 0.98 - weight_decay: 0.001 - sched: - name: NoamAnnealing - d_model: 512 - warmup_steps: 10000 - warmup_ratio: null - min_lr: 1.0e-06 -compute_eval_loss: false -variational_noise: - start_step: 0 - std: 0.0 -target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModelBPE -nemo_version: 1.9.0rc0 diff --git a/balu_codes/model_config_from_transcribe_py.yaml b/balu_codes/model_config_from_transcribe_py.yaml index 4951a0f966a1..8e784965be43 100644 --- a/balu_codes/model_config_from_transcribe_py.yaml +++ b/balu_codes/model_config_from_transcribe_py.yaml @@ -1,38 +1,37 @@ sample_rate: 16000 -log_prediction: true +log_prediction: false ctc_reduction: mean_batch skip_nan_grad: false +a_model_name: BPE:stt_en_conformer_ctc_large # CHANGE, BPE: is a must since, it is used to load audio encoder. +labelled_manifest: False # CHANGE +exp_dir: /tmp/bld56_dataset_v1/tmp/ # CHANGE +wandb: + run_name: "snr_0.7_ada+df" # CHANGE + project: "NEMO_TEST" # CHANGE + create_wandb_logger: false # CHANGE + log_model: False # CHANGE + +use_video_modality: false # CHANGE +use_pretrained_dec: true # CHANGE train_ds: - manifest_filepath: - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json - - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json + manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_train_no_label.json # CHANGE + video_frame_rate: 5 # CHANGE + get_vid_feats: true # CHANGE, always keep it to true. 
get_zero_vid_feats: false # CHANGE
   sample_rate: 16000
-  batch_size: 1
+  batch_size: 8 # CHANGE
   shuffle: true
-  num_workers: 4
+  num_workers: 12 # CHANGE
   pin_memory: true
   use_start_end_token: false
   trim_silence: false
-  max_duration: 10.0
+  max_duration: 20.0 # CHANGE
   min_duration: 0.1
-  is_tarred: true
-  tarred_audio_filepaths:
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/audio__OP_0..8191_CL_.tar
-  - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/audio__OP_0..8191_CL_.tar
+  is_tarred: false # CHANGE
+  tarred_audio_filepaths: null # CHANGE
   shuffle_n: 2048
   bucketing_strategy: synced_randomized
+  override_snr_ratio: 0.6 # CHANGE: if a float, it is used as the snr; if null, the value from the manifest is used.
   bucketing_batch_size:
   - 34
   - 30
@@ -43,35 +42,47 @@ train_ds:
   - 12
   - 8
 validation_ds:
-  manifest_filepath:
-  - /manifests/librispeech/librivox-dev-other.json
-  - /manifests/librispeech/librivox-dev-clean.json
-  - /manifests/librispeech/librivox-test-other.json
-  - /manifests/librispeech/librivox-test-clean.json
+  manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE
+  video_frame_rate: 5 # CHANGE
+  get_vid_feats: true # CHANGE, always keep it to true.
+  get_zero_vid_feats: false # CHANGE
   sample_rate: 16000
   batch_size: 32
   shuffle: false
-  num_workers: 8
+  num_workers: 12
   pin_memory: true
+  override_snr_ratio: 0.6 # CHANGE: if a float, it is used as the snr; if null, the value from the manifest is used.
   use_start_end_token: false
 test_ds:
-  manifest_filepath:
-  - /manifests/librispeech/librivox-dev-other.json
-  - /manifests/librispeech/librivox-dev-clean.json
-  - /manifests/librispeech/librivox-test-other.json
-  - /manifests/librispeech/librivox-test-clean.json
+  manifest_filepath: /tmp/bld56_dataset_v1/it1_70/annotations/manifest_eval_no_label.json # CHANGE
+  video_frame_rate: 5 # CHANGE
+  get_vid_feats: true # CHANGE, always keep it to true.
+  get_zero_vid_feats: false # CHANGE
   sample_rate: 16000
   batch_size: 32
   shuffle: false
-  num_workers: 8
+  num_workers: 12
   pin_memory: true
+  override_snr_ratio: 0.6 # CHANGE: if a float, it is used as the snr; if null, the value from the manifest is used.
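+  # Note: these keys can also be overridden from the command line, since
+  # train_av_asr.py merges OmegaConf.from_cli() over this file. A hypothetical
+  # invocation (exact flags depend on the script's own argparse options):
+  #   python train_av_asr.py test_ds.override_snr_ratio=0.4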
 use_start_end_token: false
-tokenizer:
-  dir: /tokenizers/NeMo_ASR_SET/English/asr_set_3.0/tokenizer_spe_unigram_v128
+
+# NEW TOKENIZER
+# tokenizer: # CHANGE
+#   dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/
+#   type: bpe
+#   model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.model
+#   vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/vocab.txt
+#   spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/final_tokenizer/tokenizer.vocab
+
+
+# OLD TOKENIZER
+tokenizer: # CHANGE; set the decoder num_classes to 128 when using this tokenizer.
+  dir: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/
   type: bpe
-  model_path: nemo:e06949b0b85a485e9f280ea6d19e5492_tokenizer.model
-  vocab_path: nemo:53bbc634b62446de83525753e95a50ac_vocab.txt
-  spe_tokenizer_vocab: nemo:ff63e3c43c5f4b95bff702425366a4a6_tokenizer.vocab
+  model_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.model
+  vocab_path: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/vocab.txt
+  spe_tokenizer_vocab: /home/bld56/gsoc/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/init_toknizer/tokenizer.vocab
+
 preprocessor:
   _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
   sample_rate: 16000
@@ -92,6 +103,17 @@ spec_augment:
   time_masks: 10
   freq_width: 27
   time_width: 0.05
+
+av_encoder: # CHANGE
+  d_model: 512
+  nhead: 8
+  num_layers: 4
+  dropout: 0.1
+
+v_model: # CHANGE
+  feat_dim: 512
+
+
 encoder:
   _target_: nemo.collections.asr.modules.ConformerEncoder
   feat_in: 80
@@ -118,7 +140,7 @@ encoder:
 decoder:
   _target_: nemo.collections.asr.modules.ConvASRDecoder
   feat_in: 512
-  num_classes: 128
+  num_classes: 128 # CHANGE: 356 with the new tokenizer, otherwise 128.
   vocabulary:
   -
   - ▁
@@ -258,12 +280,30 @@ optim:
   sched:
     name: NoamAnnealing
     d_model: 512
-    warmup_steps: 10000
+    warmup_steps: 2000
     warmup_ratio: null
     min_lr: 1.0e-06
-compute_eval_loss: false
+compute_eval_loss: false # CHANGE
+
+adapters: # CHANGE
+  linear_adapter:
+    keep: false
+    name: "AV_v1" #@param {type:"string"}
+    dim: 64 #@param {type:"integer"}
+    activation: "swish" #@param {type:"string"}
+    norm_position: "pre" #@param ["pre", "post"]
+    dropout: 0.1 #@param {type:"number"}
+  multi_head_attention_adapter:
+    keep: false # TODO @Balu: needs a deeper understanding of this config; ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb
+  rel_position_multi_head_attention_adapter:
+    keep: false # TODO @Balu: needs a deeper understanding of this config; ref: tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb
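+# The `keep` flags above choose which adapter types `manage_model_adapters` (in
+# balu_codes/train_av_asr.py) inserts into the model. For reference, a linear
+# adapter of this exact shape is added in NeMo roughly as follows (a sketch
+# following the referenced adapters tutorial; treat the import path as an
+# assumption, not something this patch pins down):
+#     from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig
+#     cfg = LinearAdapterConfig(in_features=512, dim=64, activation='swish',
+#                               norm_position='pre', dropout=0.1)
+#     model.add_adapter(name='AV_v1', cfg=cfg)
+#     model.set_enabled_adapters(name='AV_v1', enabled=True)
+#     model.freeze(); model.unfreeze_enabled_adapters()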
+
+
+
+
+
 variational_noise:
   start_step: 0
   std: 0.0
-target: nemo.collections.asr.models.ctc_bpe_models.EncDecCTCModelBPE
+target: nemo.collections.asr.models.av_ctc_bpe_models.AV_EncDecCTCModelBPE
 nemo_version: 1.9.0rc0
diff --git a/balu_codes/train_av_asr.py b/balu_codes/train_av_asr.py
index bff0e3acbd6a..1270a2d5f7a0 100644
--- a/balu_codes/train_av_asr.py
+++ b/balu_codes/train_av_asr.py
@@ -19,6 +19,7 @@ def load_and_configure_model(config_file_path):
     conf = OmegaConf.load(config_file_path)
     overrides = OmegaConf.from_cli()
+    print(overrides)
     updated_conf = OmegaConf.merge(conf, overrides)
     OmegaConf.set_struct(updated_conf, True)
     model = nemo_asr.models.AV_EncDecCTCModelBPE(updated_conf)
@@ -96,22 +97,41 @@ def setup_exp_manager(trainer, model):
     return logdir
 
 
+def selective_load(model, checkpoint_path):
+    checkpoint = torch.load(checkpoint_path)
+    state_dict = checkpoint['state_dict']
+
+    # Filter out keys that are missing from the model or whose shapes no longer match
+    model_state_dict = model.state_dict()
+    filtered_state_dict = {k: v for k, v in state_dict.items() if k in model_state_dict and v.size() == model_state_dict[k].size()}
+
+    # Update the existing model state dict with the filtered state dict from the checkpoint
+    model_state_dict.update(filtered_state_dict)
+
+    # Load the updated state dict back into the model
+    model.load_state_dict(model_state_dict)
+    # print(f"Loaded keys from checkpoint: {filtered_state_dict.keys()}")
+    print(model)
+    return model
+
+
 # Main function to execute the workflow
 def main(config_file_path, args):
     model, conf = load_and_configure_model(config_file_path)
     if args.resume_pretrained:
         ckpt_path = f"/tmp/bld56_dataset_v1/saved_models/pre_av_ndec_uman_ntok--val_u_wer=0.0809-epoch=11.ckpt"
-        checkpoint = torch.load(ckpt_path)
-        model.load_state_dict(checkpoint['state_dict'])
-        print(model)
+        # checkpoint = torch.load(ckpt_path)
+        # print(checkpoint['state_dict'].keys())
+        # model.load_state_dict(checkpoint['state_dict'])
+        model = selective_load(model, ckpt_path)
         model.cfg.wandb.run_name += 'pre+'
     manage_model_adapters(model, conf)
     trainer = setup_trainer()
     model.set_trainer(trainer)
     logdir = setup_exp_manager(trainer, model)
 
-    trainer.fit(model)
-    # trainer.validate(model)
+    # trainer.fit(model)
+    trainer.validate(model)
 
 if __name__ == "__main__":
     # add config number args
diff --git a/balu_codes/train_av_asr.sh b/balu_codes/train_av_asr.sh
index 6860ae5616c9..7fb09d2ab35f 100644
--- a/balu_codes/train_av_asr.sh
+++ b/balu_codes/train_av_asr.sh
@@ -59,14 +59,14 @@ source activate /home/bld56/.miniconda3/envs/nemo
 
 # av_ndec_lman_ntok
 # CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 5 --snr 0.5 --resume_pretrained True &
-sleep 20
+# sleep 40
 
 # av_ndec_uman_ntok
 # CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 9 --snr 0.5 --resume_pretrained True &
 
 # au_ndec_lman_ntok
 CUDA_VISIBLE_DEVICES=0 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 10 --snr 0.5 --resume_pretrained True &
-sleep 20
+sleep 40
 
 # au_ndec_uman_ntok
 CUDA_VISIBLE_DEVICES=1 /home/bld56/.miniconda3/envs/nemo/bin/python train_av_asr.py --config 11 --snr 0.5 --resume_pretrained True &
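For context on the `selective_load` helper added above: it keeps only checkpoint entries whose names and shapes still match the freshly built model, which is what lets a decoder head resized from 128 to 356 classes coexist with otherwise pretrained weights. A small self-contained illustration of the same filtering idea (the tensor names and shapes here are made up, not taken from the checkpoint in this patch):

import torch

ckpt = {'decoder.weight': torch.zeros(128, 512), 'encoder.weight': torch.zeros(512, 512)}
model_state = {'decoder.weight': torch.zeros(356, 512), 'encoder.weight': torch.zeros(512, 512)}
kept = {k: v for k, v in ckpt.items() if k in model_state and v.size() == model_state[k].size()}
print(sorted(kept))  # ['encoder.weight'] -- the resized decoder head is skipped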
diff --git a/nemo/collections/asr/data/av_to_text.py b/nemo/collections/asr/data/av_to_text.py
index 55ee5034d81d..5925e9a4810f 100644
--- a/nemo/collections/asr/data/av_to_text.py
+++ b/nemo/collections/asr/data/av_to_text.py
@@ -69,19 +69,19 @@ def _speech_collate_fn(batch, pad_id, get_vid_feats):
     """
     packed_batch = list(zip(*batch))
     if get_vid_feats:
-        if len(packed_batch) == 6:
-            _, audio_lengths, _, _, tokens_lengths, sample_ids = packed_batch
-        elif len(packed_batch) == 5:
+        if len(packed_batch) == 7:
+            _, audio_lengths, _, _, tokens_lengths, labels, sample_ids = packed_batch
+        elif len(packed_batch) == 6:
             sample_ids = None
-            _, audio_lengths, _, _, tokens_lengths = packed_batch
+            _, audio_lengths, _, _, tokens_lengths, labels = packed_batch
         else:
-            raise ValueError(f"Expects 5 or 6 tensors in the batch!")
+            raise ValueError(f"Expects 6 or 7 tensors in the batch!")
     else:
-        if len(packed_batch) == 4:
+        if len(packed_batch) == 5:
             sample_ids = None
-            _, audio_lengths, _, tokens_lengths = packed_batch
-        elif len(packed_batch) == 5:
-            _, audio_lengths, _, tokens_lengths, sample_ids = packed_batch
+            _, audio_lengths, _, tokens_lengths, labels = packed_batch
+        elif len(packed_batch) == 6:
+            _, audio_lengths, _, tokens_lengths, labels, sample_ids = packed_batch
         else:
-            raise ValueError(f"Expects 4 or 5 tensors in the batch!")
+            raise ValueError(f"Expects 5 or 6 tensors in the batch!")
     max_audio_len = 0
@@ -90,16 +90,16 @@ def _speech_collate_fn(batch, pad_id, get_vid_feats):
         max_audio_len = max(audio_lengths).item()
     max_tokens_len = max(tokens_lengths).item()
 
-    audio_signal, tokens, video_feat_signal = [], [], []
+    audio_signal, tokens, video_feat_signal, labels = [], [], [], []
     for b in batch:
-        if len(b) == 6 and get_vid_feats:
-            sig, sig_len, video_feat, tokens_i, tokens_i_len, _ = b
-        elif len(b) == 5 and get_vid_feats:
-            sig, sig_len, video_feat, tokens_i, tokens_i_len = b
+        if len(b) == 7 and get_vid_feats:
+            sig, sig_len, video_feat, tokens_i, tokens_i_len, label, _ = b
+        elif len(b) == 6 and get_vid_feats:
+            sig, sig_len, video_feat, tokens_i, tokens_i_len, label = b
+        elif len(b) == 6 and not get_vid_feats:
+            sig, sig_len, tokens_i, tokens_i_len, label, _ = b
         elif len(b) == 5 and not get_vid_feats:
-            sig, sig_len, tokens_i, tokens_i_len, _ = b
-        elif len(b) == 4 and not get_vid_feats:
-            sig, sig_len, tokens_i, tokens_i_len = b
+            sig, sig_len, tokens_i, tokens_i_len, label = b
         if has_audio:
             sig_len = sig_len.item()
             if sig_len < max_audio_len:
@@ -113,6 +113,7 @@ def _speech_collate_fn(batch, pad_id, get_vid_feats):
             pad = (0, max_tokens_len - tokens_i_len)
             tokens_i = torch.nn.functional.pad(tokens_i, pad, value=pad_id)
         tokens.append(tokens_i)
+        labels.append(label)
 
     if has_audio:
         audio_signal = torch.stack(audio_signal)
@@ -123,7 +124,8 @@ def _speech_collate_fn(batch, pad_id, get_vid_feats):
         video_feat_signal = torch.stack(video_feat_signal)
     tokens = torch.stack(tokens)
     tokens_lengths = torch.stack(tokens_lengths)
-    base_output = [audio_signal, audio_lengths, tokens, tokens_lengths]
+    labels = torch.stack(labels)
+    base_output = [audio_signal, audio_lengths, tokens, tokens_lengths, labels]
     if get_vid_feats:
         base_output.insert(2, video_feat_signal)
 
@@ -492,7 +494,8 @@ def _process_sample(self, index):
             f, fl = mixed_features, torch.tensor(mixed_features.shape[0]).long()
 
         # TODO: @Balu, saving audio temporarily
-        # save_audio_path = f"/tmp/bld56_dataset_v1/audioset/temp_sample_check/{index}.wav"
+        # os.makedirs(f"/tmp/bld56_dataset_v1/audioset/temp_sample_check/snr_{self.override_snr_ratio}", exist_ok=True)
+        # save_audio_path = f"/tmp/bld56_dataset_v1/audioset/temp_sample_check/snr_{self.override_snr_ratio}/{sample.video_file.split('/')[-1].split('.')[0]}_{sample.audio_file.split('/')[-1].split('.')[0]}.wav"
         # import torchaudio
         # torchaudio.save(save_audio_path, f.unsqueeze(0), 16000)
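For reference, the length checks in `_speech_collate_fn` above line up with the per-sample tuples that `_process_sample` now emits: (signal, signal_len[, video_feats], tokens, tokens_len, label[, sample_id]). A toy illustration of the `zip(*batch)` transpose that produces `packed_batch` (dummy tensors only, not the real dataset):

import torch

# two dummy samples in the get_vid_feats + sample_id layout (7 fields each)
batch = [(torch.zeros(8), torch.tensor(8), torch.zeros(3, 512), torch.tensor([1, 2]),
          torch.tensor(2), torch.tensor(0), idx) for idx in range(2)]
packed_batch = list(zip(*batch))
print(len(packed_batch))  # 7 -> first branch; the 6/5 variants drop sample_id / video feats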
@@ -513,7 +516,7 @@
             t, tl = self.manifest_processor.process_text_by_sample(sample=sample)
 
-        output = [f, fl, torch.tensor(t).long(), torch.tensor(tl).long()]
+        output = [f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), torch.tensor(sample.label).long()]
 
         if self.get_vid_feats:
             output.insert(2, vf)
@@ -526,6 +529,7 @@ def _process_sample(self, index):
         return output
 
     def __len__(self):
+        # return 5
         # return 100
         return len(self.manifest_processor.collection)
 
diff --git a/nemo/collections/asr/models/av_ctc_bpe_models.py b/nemo/collections/asr/models/av_ctc_bpe_models.py
index 474beb0796a2..cb2b59949e75 100644
--- a/nemo/collections/asr/models/av_ctc_bpe_models.py
+++ b/nemo/collections/asr/models/av_ctc_bpe_models.py
@@ -270,6 +270,7 @@ def change_vocabulary(
             zero_infinity=True,
             reduction=self._cfg.get("ctc_reduction", "mean_batch"),
         )
+        self.ce_loss = torch.nn.CrossEntropyLoss()
 
         if decoding_cfg is None:
             # Assume same decoding config as before
diff --git a/nemo/collections/asr/models/av_ctc_models.py b/nemo/collections/asr/models/av_ctc_models.py
index 6edaa497b4b7..521b2252924b 100644
--- a/nemo/collections/asr/models/av_ctc_models.py
+++ b/nemo/collections/asr/models/av_ctc_models.py
@@ -109,7 +109,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
         self.v_linear = torch.nn.Linear(in_features = self.cfg.v_model.feat_dim, out_features = self.cfg.av_encoder.d_model)
         self.av_encoder_layer = torch.nn.TransformerEncoderLayer(d_model = self.cfg.av_encoder.d_model, nhead = self.cfg.av_encoder.nhead, dropout = self.cfg.av_encoder.dropout, batch_first=True)
         self.av_encoder = torch.nn.TransformerEncoder(self.av_encoder_layer, num_layers = self.cfg.av_encoder.num_layers)
-
+        if cfg.label_pred_head.keep:
+            self.cls_token = torch.nn.Parameter(torch.randn(1, 1, self.cfg.av_encoder.d_model))
+            self.label_predictor = torch.nn.Linear(self.cfg.av_encoder.d_model, self.cfg.label_pred_head.num_classes)
+            self.ce_loss = torch.nn.CrossEntropyLoss()
         # Modality embeddings
         self.a_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)
         self.v_modal_embs = torch.nn.Embedding(1, self.cfg.av_encoder.d_model)
@@ -243,6 +246,7 @@ def change_vocabulary(self, new_vocabulary: List[str], decoding_cfg: Optional[Di
             zero_infinity=True,
             reduction=self._cfg.get("ctc_reduction", "mean_batch"),
         )
+        self.ce_loss = torch.nn.CrossEntropyLoss()
 
         if decoding_cfg is None:
             # Assume same decoding config as before
@@ -482,6 +486,7 @@ def input_types(self) -> Optional[Dict[str, NeuralType]]:
     def output_types(self) -> Optional[Dict[str, NeuralType]]:
         return {
             "outputs": NeuralType(('B', 'T', 'D'), LogprobsType()),
+            "label_log_probs": NeuralType(('B', 'C'), LogprobsType(), optional=True),
             "encoded_lengths": NeuralType(tuple('B'), LengthsType()),
             "greedy_predictions": NeuralType(('B', 'T'), LabelsType()),
         }
@@ -556,28 +561,42 @@ def forward(
 
             # Concat and pass them through the transformer encoder
             av_encoded = torch.cat((a_encoded, v_encoded), dim=1)
+            if self.cfg.label_pred_head.keep:
+                cls_token = self.cls_token.expand(encoded.size(0), -1, -1) # Expanding to batch size
+                av_encoded = torch.cat((cls_token, av_encoded), dim=1) # Concatenating classifier token
             av_encoded = self.av_encoder(av_encoded)
 
-            # remove the v_encoded tokens
-            av_encoded = av_encoded[:, :T, :]
+            if self.cfg.label_pred_head.keep:
+                # remove the v_encoded tokens
+                av_encoded = av_encoded[:, :T, :]
 
             # B,T,C -> B,C,T
             av_encoded = av_encoded.permute(0, 2, 1)
 
-            # remove
+            if self.cfg.label_pred_head.keep:
+                # Predicting labels using the classifier token's output
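+                # The classifier token was prepended at position 0 before the AV
+                # encoder, so after the permute to (B, C, T) it is read back as time
+                # index 0. A standalone sketch of this CLS-style pooling (shapes are
+                # illustrative, not the exact modules used here):
+                #     x = torch.cat([cls.expand(B, 1, D), tokens], dim=1)  # (B, 1+T, D)
+                #     cls_out = encoder(x)[:, 0]                           # (B, D)
+                #     logits = head(cls_out)                               # (B, num_classes)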
+                cls_output = av_encoded[:, :, 0] # First token after encoding is the classifier token
+                label_log_probs = self.label_predictor(cls_output)
+            else:
+                label_log_probs = None
+
             log_probs = self.decoder(encoder_output=av_encoded)
             greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
 
         elif (not self.cfg.use_video_modality) and (not self.cfg.use_pretrained_dec):
             log_probs = self.decoder(encoder_output=encoded)
             greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+            label_log_probs = None
         elif (not self.cfg.use_video_modality) and self.cfg.use_pretrained_dec:
             log_probs = self.a_model.decoder(encoder_output=encoded)
             greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
+            label_log_probs = None
         elif self.cfg.use_video_modality and self.cfg.use_pretrained_dec:
             raise ValueError("Pretrained decoder is not supported for video modality")
+
         return (
             log_probs,
+            label_log_probs,
             encoded_len,
             greedy_predictions,
         )
@@ -591,13 +610,13 @@ def training_step(self, batch, batch_nb):
         if self.is_interctc_enabled():
             AccessMixin.set_access_enabled(access_enabled=True, guid=self.model_guid)
 
-        signal, signal_len, video_input_signal, transcript, transcript_len = batch
+        signal, signal_len, video_input_signal, transcript, transcript_len, label = batch
         # if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
         #     log_probs, encoded_len, predictions = self.forward(
         #         processed_signal=signal, processed_signal_length=signal_len
         #     )
         # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, label_log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         if hasattr(self, '_trainer') and self._trainer is not None:
             log_every_n_steps = self._trainer.log_every_n_steps
@@ -607,9 +626,14 @@ def training_step(self, batch, batch_nb):
         loss_value = self.loss(
             log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
         )
-
-        # Add auxiliary losses, if registered
-        loss_value = self.add_auxiliary_losses(loss_value)
+        if self.cfg.label_pred_head.keep:
+            label_loss_value = self.ce_loss(label_log_probs, label)
+            # Add auxiliary losses, if registered
+            loss_value = self.add_auxiliary_losses(loss_value+label_loss_value)
+        else:
+            label_loss_value = 0.0
+            loss_value = self.add_auxiliary_losses(loss_value)
+
         # only computing WER when requested in the logs (same as done for final-layer WER below)
         loss_value, tensorboard_logs = self.add_interctc_losses(
             loss_value, transcript, transcript_len, compute_wer=((batch_nb + 1) % log_every_n_steps == 0)
@@ -619,6 +643,9 @@ def training_step(self, batch, batch_nb):
         if AccessMixin.is_access_enabled(self.model_guid):
             AccessMixin.reset_registry(self)
 
+        if self.cfg.label_pred_head.keep:
+            tensorboard_logs.update({'train_label_loss': label_loss_value})
+
         tensorboard_logs.update(
             {
                 'train_loss': loss_value,
@@ -651,16 +678,16 @@ def training_step(self, batch, batch_nb):
             tensorboard_logs.update({'train_acc': acc})
             self.log('train_acc', acc, on_step=True, on_epoch=False)
 
-        return {'loss': loss_value, 'log': tensorboard_logs}
+        return {'loss': loss_value+label_loss_value, 'log': tensorboard_logs}
 
     def predict_step(self, batch, batch_idx, dataloader_idx=0):
-        signal, signal_len, video_input_signal, transcript, transcript_len, sample_id = batch
+        signal, signal_len, video_input_signal, transcript, transcript_len, label, sample_id = batch
        # if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
        #     log_probs, encoded_len, predictions = self.forward(
        #         processed_signal=signal, processed_signal_length=signal_len
        #     )
        # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, label_log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         transcribed_texts, _ = self.wer.decoding.ctc_decoder_predictions_tensor(
             decoder_outputs=log_probs, decoder_lengths=encoded_len, return_hypotheses=False,
@@ -673,17 +700,21 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0):
         if self.is_interctc_enabled():
             AccessMixin.set_access_enabled(access_enabled=True, guid=self.model_guid)
 
-        signal, signal_len, video_input_signal, transcript, transcript_len = batch
+        signal, signal_len, video_input_signal, transcript, transcript_len, label = batch
         # if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
         #     log_probs, encoded_len, predictions = self.forward(
         #         processed_signal=signal, processed_signal_length=signal_len
         #     )
         # else:
-        log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
+        log_probs, label_log_probs, encoded_len, predictions = self.forward(audio_input_signal=signal, audio_input_signal_length=signal_len, video_input_signal=video_input_signal)
 
         loss_value = self.loss(
             log_probs=log_probs, targets=transcript, input_lengths=encoded_len, target_lengths=transcript_len
         )
+        if self.cfg.label_pred_head.keep:
+            label_loss_value = self.ce_loss(label_log_probs, label)
+        else:
+            label_loss_value = 0.0
         loss_value, metrics = self.add_interctc_losses(
             loss_value, transcript, transcript_len, compute_wer=True, log_wer_num_denom=True, log_prefix="val_",
         )
@@ -694,7 +725,8 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0):
             # wer, wer_num, wer_denom = self.wer.compute()
             labelled_wer, unlabelled_wer, acc, scores_unlabelled, words_unlabelled = self.wer.compute()
             self.wer.reset()
-            metrics.update({'val_loss': loss_value, 'val_labelled_wer': labelled_wer, 'val_unlabelled_wer': unlabelled_wer, 'val_acc': acc, 'val_wer_num': scores_unlabelled, 'val_wer_denom': words_unlabelled})
+            metrics.update({'val_loss': loss_value, 'val_label_loss': label_loss_value,
+                            'val_labelled_wer': labelled_wer, 'val_unlabelled_wer': unlabelled_wer, 'val_acc': acc, 'val_wer_num': scores_unlabelled, 'val_wer_denom': words_unlabelled})
 
         # self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32))
         if labelled_wer is not None:
@@ -704,6 +736,8 @@ def validation_pass(self, batch, batch_idx, dataloader_idx=0):
         if acc is not None:
             self.log('val_acc', acc, on_epoch=True, sync_dist=True)
         self.log('val_loss', loss_value, sync_dist=True)
+        if self.cfg.label_pred_head.keep:
+            self.log('val_label_loss', label_loss_value, sync_dist=True)
 
         # Reset access registry
diff --git a/nemo/collections/common/parts/preprocessing/collections.py b/nemo/collections/common/parts/preprocessing/collections.py
index fca016b477fe..3b5f65228e8b 100644
--- a/nemo/collections/common/parts/preprocessing/collections.py
+++ b/nemo/collections/common/parts/preprocessing/collections.py
@@ -209,7 +209,7 @@ class AVText(_Collection):
 
     AV_OUTPUT_TYPE = collections.namedtuple(
         typename='AVTextEntity',
-        field_names='id audio_file video_file video_featfile 
duration text_tokens snr offset text_raw speaker orig_sr lang', + field_names='id audio_file video_file video_featfile duration text_tokens snr offset text_raw speaker orig_sr lang label', ) def __init__( @@ -226,6 +226,7 @@ def __init__( orig_sampling_rates: List[Optional[int]], token_labels: List[Optional[int]], langs: List[Optional[str]], + labels: List[Optional[str]], parser: parsers.CharParser, min_duration: Optional[float] = None, max_duration: Optional[float] = None, @@ -260,8 +261,8 @@ def __init__( if index_by_file_id: self.mapping = {} - for id_, audio_file, video_file, video_featfile, duration, offset, text, snr_ratio, speaker, orig_sr, token_labels, lang in zip( - ids, audio_files, video_files, video_featfiles, durations, offsets, texts, snr_ratios, speakers, orig_sampling_rates, token_labels, langs + for id_, audio_file, video_file, video_featfile, duration, offset, text, snr_ratio, speaker, orig_sr, token_labels, lang, label in zip( + ids, audio_files, video_files, video_featfiles, durations, offsets, texts, snr_ratios, speakers, orig_sampling_rates, token_labels, langs, labels ): # Duration filters. if min_duration is not None and duration < min_duration: @@ -296,11 +297,16 @@ def __init__( duration_filtered += duration num_filtered += 1 continue - + + if label is not None: # + # replace <,>,N and then convert to int + label = int(label.replace('<', '').replace('>', '').replace('N', '')) + label = label - 1 # 0-indexed + total_duration += duration data.append(output_type(id_, audio_file, video_file, video_featfile, duration, - text_tokens, snr_ratio, offset, text, speaker, orig_sr, lang)) + text_tokens, snr_ratio, offset, text, speaker, orig_sr, lang, label)) if index_by_file_id: file_id, _ = os.path.splitext(os.path.basename(audio_file)) if file_id not in self.mapping: @@ -488,7 +494,8 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): **kwargs: Kwargs to pass to `AVText` constructor. 
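        A minimal illustrative manifest line (one JSON object per line; the field
        values below are hypothetical, but the keys match what `__av_parse_item`
        reads, including the optional `snr` and `label` fields):

            {"audio_file": "clip.wav", "video_file": "clip.mp4", "duration": 4.2,
             "text": "hello world", "snr": 0.6, "label": "<N3>"}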
""" - ids, audio_files, video_files, durations, texts, offsets, video_featfiles, snr_ratios = ( + ids, audio_files, video_files, durations, texts, offsets, video_featfiles, snr_ratios, labels = ( + [], [], [], [], @@ -512,8 +519,9 @@ def __init__(self, manifests_files: Union[str, List[str]], *args, **kwargs): orig_srs.append(item['orig_sr']) token_labels.append(item['token_labels']) langs.append(item['lang']) + labels.append(item['label']) super().__init__( - ids, audio_files, video_files, video_featfiles, durations, snr_ratios, texts, offsets, speakers, orig_srs, token_labels, langs, *args, **kwargs + ids, audio_files, video_files, video_featfiles, durations, snr_ratios, texts, offsets, speakers, orig_srs, token_labels, langs, labels, *args, **kwargs ) diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py index 773bbb13e90b..fba05937d7b5 100644 --- a/nemo/collections/common/parts/preprocessing/manifest.py +++ b/nemo/collections/common/parts/preprocessing/manifest.py @@ -240,7 +240,6 @@ def av_item_iter( errors[str(manifest_file)].append(line) continue item['id'] = k - yield item if len(errors) > 0: @@ -330,6 +329,11 @@ def __av_parse_item(line: str, manifest_file: str) -> Dict[str, Any]: else: item['snr'] = None + if 'label' in item: + item['label'] = item.pop('label') + else: + item['label'] = None + item = dict( audio_file=item.get('audio_file', None), video_file=item.get('video_file', None), @@ -343,6 +347,7 @@ def __av_parse_item(line: str, manifest_file: str) -> Dict[str, Any]: token_labels=item.get('token_labels', None), lang=item.get('lang', None), snr_ratio=item.get('snr', None), + label=item.get('label', None), ) return item diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb index cdc0afdf5a81..d3db783559a3 100644 --- a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb +++ b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb @@ -2,11 +2,22 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "EGV_ioUHqhun" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nRemember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\\nAlternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\\nthat you want to use the \"Run All Cells\" (or similar) option.\\n'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -23,14 +34,14 @@ "\"\"\"\n", "\n", "# Install dependencies\n", - "!pip install wget\n", - "!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3\n", - "!pip install text-unidecode\n", - "!pip install matplotlib>=3.3.2\n", + "# !pip install wget\n", + "# !apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3\n", + "# !pip install text-unidecode\n", + "# !pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", + "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", "Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. 
matplotlib)!\n", @@ -159,7 +170,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b900b618b438414dacc6cd9c7928a8ef", + "model_id": "f4940f6a57be4278b1fc21d1266f7289", "version_major": 2, "version_minor": 0 }, @@ -230,25 +241,7 @@ "metadata": { "id": "Q2NbhCNBoHdq" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-06-26 19:36:46-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 14735 (14K) [text/plain]\n", - "Saving to: ‘convert_hf_dataset_to_nemo.py’\n", - "\n", - "convert_hf_dataset_ 100%[===================>] 14.39K --.-KB/s in 0s \n", - "\n", - "2024-06-26 19:36:46 (213 MB/s) - ‘convert_hf_dataset_to_nemo.py’ saved [14735/14735]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if not os.path.exists(\"convert_hf_dataset_to_nemo.py\"):\n", " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py\n" @@ -256,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "id": "Inwx4OE97guu" }, @@ -273,26 +266,40 @@ "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", " ret = run_job(\n", - "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", - "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", - "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", - " warnings.warn(\n", - "Downloading builder script: 100%|██████████| 11.5k/11.5k [00:00<00:00, 45.5MB/s]\n", - "Downloading readme: 100%|██████████████████| 10.8k/10.8k [00:00<00:00, 1.65MB/s]\n", - "Downloading extra modules: 100%|███████████| 3.29k/3.29k [00:00<00:00, 35.1MB/s]\n", - "Downloading extra modules: 100%|████████████| 39.9k/39.9k [00:00<00:00, 325kB/s]\n", - "Downloading data: 100%|██████████████████████| 153M/153M [00:05<00:00, 27.9MB/s]\n", - "Generating train split: 722 examples [00:01, 665.83 examples/s] \n", - "Generating test split: 632 examples [00:01, 583.76 examples/s]\n", - "Generating validation split: 586 examples [00:01, 545.90 examples/s]\n", - "Generating other split: 885 examples [00:01, 813.03 examples/s] \n", - "Generating invalidated split: 504 examples [00:01, 472.40 examples/s] \n", - "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = train\n", - "Map (num_proc=8): 100%|████████████████| 722/722 [00:07<00:00, 91.68 examples/s]\n", - "Processing mozilla-foundation/common_voice_6_1 (split : train):: 100%|█| 722/722\n", + "HuggingFace datasets failed due to some reason (stack trace below). 
\n", + "For certain datasets (eg: MCV), it may be necessary to login to the huggingface-cli (via `huggingface-cli login`).\n", + "Once logged in, you need to set `use_auth_token=True` when calling this script.\n", "\n", - "Dataset conversion finished !\n", - "\u001b[0m/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", + "Traceback error for reference :\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py\", line 358, in main\n", + " dataset = load_dataset(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2548, in load_dataset\n", + " builder_instance = load_dataset_builder(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2220, in load_dataset_builder\n", + " dataset_module = dataset_module_factory(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1871, in dataset_module_factory\n", + " raise e1 from None\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1816, in dataset_module_factory\n", + " raise e\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1790, in dataset_module_factory\n", + " dataset_info = hf_api.dataset_info(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 2137, in dataset_info\n", + " headers = self._build_hf_headers(token=token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 8191, in _build_hf_headers\n", + " return build_hf_headers(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 121, in build_hf_headers\n", + " token_to_send = get_token_to_send(token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 153, in get_token_to_send\n", + " raise LocalTokenNotFoundError(\n", + "huggingface_hub.utils._headers.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.\n", + "\n", + "/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", "The version_base parameter is not specified.\n", "Please specify a compatability version level, or None.\n", "Will assume defaults for version 1.1\n", @@ -300,16 +307,40 @@ "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", " ret = run_job(\n", - "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. 
You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", - "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", - "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", - " warnings.warn(\n", - "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = validation\n", - "Map (num_proc=8): 100%|████████████████| 586/586 [00:06<00:00, 85.82 examples/s]\n", - "Processing mozilla-foundation/common_voice_6_1 (split : validation):: 100%|█| 58\n", + "HuggingFace datasets failed due to some reason (stack trace below). \n", + "For certain datasets (eg: MCV), it may be necessary to login to the huggingface-cli (via `huggingface-cli login`).\n", + "Once logged in, you need to set `use_auth_token=True` when calling this script.\n", "\n", - "Dataset conversion finished !\n", - "\u001b[0m/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", + "Traceback error for reference :\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py\", line 358, in main\n", + " dataset = load_dataset(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2548, in load_dataset\n", + " builder_instance = load_dataset_builder(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2220, in load_dataset_builder\n", + " dataset_module = dataset_module_factory(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1871, in dataset_module_factory\n", + " raise e1 from None\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1816, in dataset_module_factory\n", + " raise e\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1790, in dataset_module_factory\n", + " dataset_info = hf_api.dataset_info(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 2137, in dataset_info\n", + " headers = self._build_hf_headers(token=token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 8191, in _build_hf_headers\n", + " return build_hf_headers(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 121, in build_hf_headers\n", + " token_to_send = get_token_to_send(token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 153, in get_token_to_send\n", + " raise LocalTokenNotFoundError(\n", + "huggingface_hub.utils._headers.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. 
See https://huggingface.co/settings/tokens.\n", + "\n", + "/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py:346: UserWarning: \n", "The version_base parameter is not specified.\n", "Please specify a compatability version level, or None.\n", "Will assume defaults for version 1.1\n", @@ -317,16 +348,39 @@ "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", "See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", " ret = run_job(\n", - "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1454: FutureWarning: The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1\n", - "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", - "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", - " warnings.warn(\n", - "Single split found for dataset mozilla-foundation/common_voice_6_1 | Split chosen = test\n", - "Map (num_proc=8): 100%|████████████████| 632/632 [00:08<00:00, 74.60 examples/s]\n", - "Processing mozilla-foundation/common_voice_6_1 (split : test):: 100%|█| 632/632 \n", + "HuggingFace datasets failed due to some reason (stack trace below). \n", + "For certain datasets (eg: MCV), it may be necessary to login to the huggingface-cli (via `huggingface-cli login`).\n", + "Once logged in, you need to set `use_auth_token=True` when calling this script.\n", "\n", - "Dataset conversion finished !\n", - "\u001b[0m" + "Traceback error for reference :\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/workspace/nemo/NeMo-opensource/tutorials/asr/convert_hf_dataset_to_nemo.py\", line 358, in main\n", + " dataset = load_dataset(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2548, in load_dataset\n", + " builder_instance = load_dataset_builder(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2220, in load_dataset_builder\n", + " dataset_module = dataset_module_factory(\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1871, in dataset_module_factory\n", + " raise e1 from None\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1816, in dataset_module_factory\n", + " raise e\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1790, in dataset_module_factory\n", + " dataset_info = hf_api.dataset_info(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 2137, in dataset_info\n", + " headers = self._build_hf_headers(token=token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\", line 8191, in _build_hf_headers\n", + " return build_hf_headers(\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 121, in build_hf_headers\n", + " token_to_send = 
get_token_to_send(token)\n", + " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_headers.py\", line 153, in get_token_to_send\n", + " raise LocalTokenNotFoundError(\n", + "huggingface_hub.utils._headers.LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.\n", + "\n" ] } ], @@ -376,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "id": "j7WAGLX59C26" }, @@ -389,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -398,7 +452,7 @@ "'datasets/ja/mozilla-foundation/common_voice_6_1/ja'" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -435,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": { "id": "EdkJYxUirp7C" }, @@ -460,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "id": "HngfzcwOijy4" }, @@ -482,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": { "id": "T2iwnvhXimfG" }, @@ -506,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "id": "XpUb_pI5imhh" }, @@ -525,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": { "id": "obcPlrOJimju" }, @@ -533,7 +587,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7a31ac5a54444b02b1bef6148a6fbf1d", + "model_id": "4a3894d4e36c40c69773f6dd988560ba", "version_major": 2, "version_minor": 0 }, @@ -547,7 +601,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6c8c313765bf4cdc8510cb3ff7f8374e", + "model_id": "d170c9e37ca849698ee7ff985f939c2f", "version_major": 2, "version_minor": 0 }, @@ -561,7 +615,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "41e7c23a68304e76a60de007938a9977", + "model_id": "6e1b969e796747a08f8cecd0d9a1ca92", "version_major": 2, "version_minor": 0 }, @@ -590,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": { "id": "Z8QVdph6imlz" }, @@ -602,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "id": "NgCfETWNimn3" }, @@ -660,7 +714,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "id": "KPrBi35Cimqc" }, @@ -671,7 +725,7 @@ "text": [ "Number of OOV tokens in test set : 178\n", "\n", - "{'照', '顧', '筋', '瀬', '懐', '釣', '旋', '昔', '孫', '因', '負', '餐', 'D', '遭', '硬', '奏', '具', '景', '印', '翻', '弊', '躇', '襲', '命', '床', '叩', '皆', '珍', '却', '縁', '艇', '浅', '震', '粉', '完', '紫', '洒', '盛', '追', '層', '繕', '殖', '〇', '捨', '塵', '淵', '穴', '帝', '署', '繋', '肥', '伝', '協', '旧', '触', 'G', '謙', '溢', '郊', '劣', '柱', '劇', '効', '償', '諸', '賄', 'P', '丸', '否', '南', '浸', '歓', '岩', '卒', '郡', '届', '碑', '滅', '灯', '犠', '純', '牲', '像', '胴', '扉', '瞬', '裕', '君', '退', '躊', '翌', '綴', '器', '偉', '領', '採', '訪', '接', '鍵', '県', '占', '眺', '掌', '岸', '冠', '懸', '吉', '袋', '髣', '級', '祖', '温', '区', '殿', '寵', '往', '遇', '乾', '躍', '璧', '税', '肘', '渠', '噂', '幅', '刑', '承', '融', '草', '壮', '挙', '冊', '森', '士', '苑', '忠', '異', '識', '希', '衣', '概', '砲', '打', '垢', '纏', '憶', '処', '鞄', '宴', '可', '騒', '僕', '麗', '嶋', '棋', '駐', '拍', '髴', '塁', '抑', '称', '彩', '示', 
'殻', '件', '極', '賂', '罅', '擬', '獅', '闘', 'ぷ', '既', '剥', '茂', '湯', '獲', '?'}\n" + "{'否', '称', '純', '躊', '茂', '区', '承', '壮', '掌', '希', '嶋', '粉', '肘', '郡', '劇', '示', '器', '概', '謙', '洒', '罅', '躍', '草', '識', '森', '効', '融', '冠', '偉', '叩', '浅', '退', '震', '処', '憶', '塵', '接', '塁', '瀬', '浸', '旧', 'G', '盛', '髴', '皆', '税', '訪', '級', '鍵', '君', '層', '璧', '淵', '獅', '遭', '躇', '渠', '完', '冊', '艇', '遇', '賄', '占', '殖', '件', '滅', '擬', '岸', '温', '髣', '?', '柱', '珍', '溢', '捨', '岩', '却', '牲', '領', '触', '負', '砲', '昔', '往', '賂', '打', '裕', '追', '抑', '犠', '翌', '卒', '〇', '殿', '苑', '帝', '綴', '拍', '袋', '駐', '景', '縁', '穴', '南', '因', '騒', '郊', '餐', '衣', '命', '極', '歓', '硬', '胴', '諸', '殻', '寵', '照', '印', '襲', '湯', '償', '釣', '忠', '協', '繋', '扉', '採', '灯', '宴', '刑', '旋', '孫', '乾', '繕', 'P', '麗', '懸', '懐', '可', '紫', 'D', '顧', '床', '弊', '鞄', '瞬', '具', '垢', '碑', '像', '筋', '既', '翻', '剥', '闘', '劣', '祖', '噂', '異', '獲', '眺', '肥', '纏', '棋', '伝', '僕', '奏', '吉', 'ぷ', '丸', '県', '彩', '幅', '士', '挙', '署', '届'}\n" ] } ], @@ -710,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "id": "VDDiXCiPimr_" }, @@ -738,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": { "id": "TJeVEKvAimwE" }, @@ -767,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": { "id": "rKULANgINqbq" }, @@ -804,7 +858,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "id": "9G6laS0ojV-B" }, @@ -842,7 +896,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": { "id": "jnh_pnL2jWAY" }, @@ -891,7 +945,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": { "cellView": "form", "id": "kaX9WzK15Q6t" @@ -905,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": { "id": "HiEZVEshOp-y" }, @@ -920,7 +974,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": { "id": "pV4kOgpvjWGg" }, @@ -960,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": { "id": "NN3asqvsrp_S" }, @@ -1003,7 +1057,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": { "id": "mwNtHeHLjqJl" }, @@ -1021,7 +1075,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": { "id": "xB06YHmDr-Ja" }, @@ -1037,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": { "id": "4lqUvpkrr7bQ" }, @@ -1045,7 +1099,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1077e49a05ca4aab8a4c6ef5f4b356f6", + "model_id": "85d3ee86b95547fc9781be780c913166", "version_major": 2, "version_minor": 0 }, @@ -1059,7 +1113,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b646ff7faf5e4d80b21c3a1ce3f8de87", + "model_id": "0bc288fa81e54ba1bc0dc5fd60180ecc", "version_major": 2, "version_minor": 0 }, @@ -1073,7 +1127,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5895cc18274648a8a40b64d71ceee116", + "model_id": "ef828a1a17ea40f384fd0d67c048a1e0", "version_major": 2, "version_minor": 0 }, @@ -1094,7 +1148,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cc6874e918ce4c408a69630ac7767788", + "model_id": "4de0c06375004f1fb7cd65d24918f4e3", "version_major": 2, "version_minor": 0 }, @@ -1108,7 +1162,7 @@ { "data": { 
"application/vnd.jupyter.widget-view+json": { - "model_id": "d80fd3184828476b95bc80232253a5bb", + "model_id": "adba0f98dcf843e9b653ad631c1fcae7", "version_major": 2, "version_minor": 0 }, @@ -1122,7 +1176,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aefd8a315594441fa06974ff83070f07", + "model_id": "52761487aad74046ace1677cc115f5be", "version_major": 2, "version_minor": 0 }, @@ -1143,7 +1197,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "95ddca797e85407499072648f4bb110c", + "model_id": "e703faee525a4c10896c55e43f5c7382", "version_major": 2, "version_minor": 0 }, @@ -1157,7 +1211,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5ad49631bc644624bc0cac82279ba2e7", + "model_id": "587c9a12a67c4f579eae29d19c4e6076", "version_major": 2, "version_minor": 0 }, @@ -1171,7 +1225,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c47b2b803f984a6db9278bb0ad5a575b", + "model_id": "1372481d201946c4acbf94652c511045", "version_major": 2, "version_minor": 0 }, @@ -1223,7 +1277,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": { "id": "WpHk6HW6O0FW" }, @@ -1231,7 +1285,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e8017c394a5a4798b8996f84d2e688b2", + "model_id": "c4dcc83680034beda20118390c3d7523", "version_major": 2, "version_minor": 0 }, @@ -1245,7 +1299,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "01ff88c15ecc458dba01b27665b327ca", + "model_id": "4aa99bb7a8fc4d6187585da05aaf5a6e", "version_major": 2, "version_minor": 0 }, @@ -1269,7 +1323,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": { "id": "R3xkR4_dPd3C" }, @@ -1301,11 +1355,111 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "id": "DlJmwh-iei77" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-22 20:47:39 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_quartznet15x5/versions/1.0.0rc1/files/stt_en_quartznet15x5.nemo to /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo\n", + "[NeMo I 2024-07-22 20:47:46 common:924] Instantiating model from pre-trained checkpoint\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NeMo W 2024-07-22 20:47:47 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + " Train config : \n", + " manifest_filepath: /data2/voices/train_1k.json\n", + " sample_rate: 16000\n", + " labels:\n", + " - ' '\n", + " - a\n", + " - b\n", + " - c\n", + " - d\n", + " - e\n", + " - f\n", + " - g\n", + " - h\n", + " - i\n", + " - j\n", + " - k\n", + " - l\n", + " - m\n", + " - 'n'\n", + " - o\n", + " - p\n", + " - q\n", + " - r\n", + " - s\n", + " - t\n", + " - u\n", + " - v\n", + " - w\n", + " - x\n", + " - 'y'\n", + " - z\n", + " - ''''\n", + " batch_size: 32\n", + " trim_silence: true\n", + " max_duration: 16.7\n", + " shuffle: true\n", + " is_tarred: false\n", + " tarred_audio_filepaths: /asr_set_1.2/train/train_{0..1023}.tar\n", + " num_workers: 20\n", + " \n", + "[NeMo W 2024-07-22 20:47:47 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or 
ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + " Validation config : \n", + " manifest_filepath: /data2/voices/train_1k_samp.json\n", + " sample_rate: 16000\n", + " labels:\n", + " - ' '\n", + " - a\n", + " - b\n", + " - c\n", + " - d\n", + " - e\n", + " - f\n", + " - g\n", + " - h\n", + " - i\n", + " - j\n", + " - k\n", + " - l\n", + " - m\n", + " - 'n'\n", + " - o\n", + " - p\n", + " - q\n", + " - r\n", + " - s\n", + " - t\n", + " - u\n", + " - v\n", + " - w\n", + " - x\n", + " - 'y'\n", + " - z\n", + " - ''''\n", + " batch_size: 32\n", + " shuffle: false\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NeMo I 2024-07-22 20:47:47 features:289] PADDING: 16\n", + "[NeMo I 2024-07-22 20:47:47 save_restore_connector:249] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo.\n" + ] + } + ], "source": [ "char_model = nemo_asr.models.ASRModel.from_pretrained(\"stt_en_quartznet15x5\", map_location='cpu')" ] @@ -1821,31 +1975,20 @@ "Therefore, we will construct a sub-word tokenizer with vocabulary size exactly the same as the character encoding model plus add a few tokens required by SentencePiece required to perform tokenization. You can experiment with the effect of larger vocabularies by editing `VOCAB_SIZE` below." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making new tokenizer" + ] + }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 42, "metadata": { "id": "yIUQklly9BPa" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2024-06-26 19:55:08-- https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 16631 (16K) [text/plain]\n", - "Saving to: ‘scripts/process_asr_text_tokenizer.py’\n", - "\n", - "process_asr_text_to 100%[===================>] 16.24K --.-KB/s in 0s \n", - "\n", - "2024-06-26 19:55:08 (159 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [16631/16631]\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if not os.path.exists(\"scripts/process_asr_text_tokenizer.py\"):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py" @@ -1853,7 +1996,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 43, "metadata": { "id": "SKA9rrpbm3nu" }, @@ -1876,14 +2019,15 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 44, "metadata": { "id": "lO_uskUEm2ZG" }, "outputs": [], "source": [ "# << VOCAB SIZE can be changed to any value larger than (len(train_dev_set) + 2)! 
>>\n", - "VOCAB_SIZE = len(train_dev_set) + 2" + "# VOCAB_SIZE = len(train_dev_set) + 2\n", + "VOCAB_SIZE = 128" ] }, { @@ -1901,7 +2045,26 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "train_manifest_cleaned = \"/workspace/dataset/train_clean/manifest.json\"\n", + "dev_manifest_cleaned = \"/workspace/dataset/validation/manifest.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_dir = \"/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer\"" + ] + }, + { + "cell_type": "code", + "execution_count": 51, "metadata": { "id": "yT-SBPN2Ox6Y" }, @@ -1910,16 +2073,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:root:Corpus already exists at path : tokenizers/ja/text_corpus/document.txt\n", - "[NeMo I 2024-06-26 20:52:30 sentencepiece_tokenizer:316] Processing tokenizers/ja/text_corpus/document.txt and store at tokenizers/ja/tokenizer_spe_bpe_v1208\n", - "sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tokenizers/ja/text_corpus/document.txt --model_prefix=tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer --vocab_size=1208 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1\n", + "INFO:root:Finished extracting manifest : /workspace/dataset/train_clean/manifest.json\n", + "INFO:root:Finished extracting manifest : /workspace/dataset/validation/manifest.json\n", + "INFO:root:Finished extracting all manifests ! Number of sentences : 236954\n", + "[NeMo I 2024-07-18 18:51:58 sentencepiece_tokenizer:316] Processing /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/text_corpus/document.txt and store at /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128\n", + "sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/text_corpus/document.txt --model_prefix=/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer --vocab_size=128 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1\n", "sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : \n", "trainer_spec {\n", - " input: tokenizers/ja/text_corpus/document.txt\n", + " input: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/text_corpus/document.txt\n", " input_format: \n", - " model_prefix: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer\n", + " model_prefix: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer\n", " model_type: BPE\n", - " vocab_size: 1208\n", + " vocab_size: 128\n", " self_test_sample_size: 0\n", " character_coverage: 1\n", " input_sentence_size: 0\n", @@ -1966,19 +2131,24 @@ "}\n", "denormalizer_spec {}\n", "trainer_interface.cc(353) LOG(INFO) SentenceIterator is not specified. 
Using MultiFileSentenceIterator.\n", - "trainer_interface.cc(185) LOG(INFO) Loading corpus: tokenizers/ja/text_corpus/document.txt\n", - "trainer_interface.cc(409) LOG(INFO) Loaded all 1308 sentences\n", + "trainer_interface.cc(185) LOG(INFO) Loading corpus: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/text_corpus/document.txt\n", + "trainer_interface.cc(409) LOG(INFO) Loaded all 236954 sentences\n", "trainer_interface.cc(425) LOG(INFO) Adding meta_piece: \n", "trainer_interface.cc(430) LOG(INFO) Normalizing sentences...\n", - "trainer_interface.cc(539) LOG(INFO) all chars count=24630\n", - "trainer_interface.cc(560) LOG(INFO) Alphabet size=1207\n", + "trainer_interface.cc(539) LOG(INFO) all chars count=46492234\n", + "trainer_interface.cc(560) LOG(INFO) Alphabet size=38\n", "trainer_interface.cc(561) LOG(INFO) Final character coverage=1\n", - "trainer_interface.cc(592) LOG(INFO) Done! preprocessed 1308 sentences.\n", - "trainer_interface.cc(598) LOG(INFO) Tokenizing input sentences with whitespace: 1308\n", - "trainer_interface.cc(609) LOG(INFO) Done! 1308\n", - "trainer_interface.cc(687) LOG(INFO) Saving model: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.model\n", - "trainer_interface.cc(699) LOG(INFO) Saving vocabs: tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.vocab\n", - "Serialized tokenizer at location : tokenizers/ja/tokenizer_spe_bpe_v1208\n", + "trainer_interface.cc(592) LOG(INFO) Done! preprocessed 236954 sentences.\n", + "trainer_interface.cc(598) LOG(INFO) Tokenizing input sentences with whitespace: 236954\n", + "trainer_interface.cc(609) LOG(INFO) Done! 48952\n", + "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1625104 min_freq=1\n", + "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=314950 size=20 all=1288 active=1250 piece=▁p\n", + "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=185513 size=40 all=2095 active=2057 piece=▁you\n", + "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=121485 size=60 all=3198 active=3160 piece=▁ha\n", + "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=72557 size=80 all=4263 active=4225 piece=se\n", + "trainer_interface.cc(687) LOG(INFO) Saving model: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer.model\n", + "trainer_interface.cc(699) LOG(INFO) Saving vocabs: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer.vocab\n", + "Serialized tokenizer at location : /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128\n", "INFO:root:Done!\n" ] } @@ -1986,7 +2156,7 @@ "source": [ "!python scripts/process_asr_text_tokenizer.py \\\n", " --manifest=$train_manifest_cleaned,$dev_manifest_cleaned \\\n", - " --vocab_size=$VOCAB_SIZE \\\n", + " --vocab_size=128 \\\n", " --data_root=$tokenizer_dir \\\n", " --tokenizer=\"spe\" \\\n", " --spe_type=$TOKENIZER_TYPE \\\n", @@ -1997,7 +2167,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 52, "metadata": { "id": "G5TxLHtKPW4E" }, @@ -2006,7 +2176,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tokenizer directory : tokenizers/ja/tokenizer_spe_bpe_v1208/\n" + "Tokenizer directory : /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v228/\n" ] } ], @@ -2017,23 +2187,265 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Tokens : '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '+'\n", + " \"\" \"\" \n" + ] + } + ], + "source": [ + "# tokens is a list of , , ... tokens\n", + "tokens = [f\"\" for i in range(1, 228 + 1)] # Just added 228 tokens, you can add more\n", + "tokens_string = \"\"\"' '\"\"\".join(tokens)\n", + "tokens_string = f\"'{tokens_string}'\"\n", + "print(f\"Tokens : {tokens_string}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO: Created token '' at ID 1208\n", - "INFO: Created token '' at ID 1209\n", - "INFO: Created token '' at ID 1210\n", - "INFO: Created token '' at ID 1211\n", - "INFO: Created token '' at ID 1212\n", - "INFO: Created token '' at ID 1213\n", - "INFO: Created token '' at ID 1214\n", - "INFO: Created token '' at ID 1215\n", - "INFO: New tokenizer vocab size: 1216\n", - "INFO: Created new tokenizer at: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model\n" + "INFO: Created token '' at ID 128\n", + "INFO: Created token '' at ID 129\n", + "INFO: Created token '' at ID 130\n", + "INFO: Created token '' at ID 131\n", + "INFO: Created token '' at ID 132\n", + "INFO: Created token '' at ID 133\n", + "INFO: Created token '' at ID 134\n", + "INFO: Created token '' at ID 135\n", + "INFO: Created token '' at ID 136\n", + "INFO: Created token '' at ID 137\n", + "INFO: Created token '' at ID 138\n", + "INFO: Created token '' at ID 139\n", + "INFO: Created token '' at ID 140\n", + "INFO: Created token '' at ID 141\n", + "INFO: Created token '' at ID 142\n", + "INFO: Created token '' at ID 143\n", + "INFO: Created token '' at ID 144\n", + "INFO: Created token '' at ID 145\n", + "INFO: Created token '' at ID 146\n", + "INFO: Created token '' at ID 147\n", + "INFO: Created token '' at ID 148\n", + "INFO: Created token '' at ID 149\n", + "INFO: Created token '' at ID 150\n", + "INFO: Created token '' at ID 151\n", + "INFO: Created token '' at ID 152\n", + "INFO: Created token '' at ID 153\n", + "INFO: Created token '' at ID 154\n", + "INFO: Created token '' at ID 155\n", + "INFO: Created token '' at ID 156\n", + "INFO: Created token '' at ID 157\n", + "INFO: Created token '' at ID 158\n", + "INFO: Created token '' at ID 159\n", + "INFO: Created token '' at ID 160\n", + "INFO: Created token '' at ID 161\n", + "INFO: Created token '' at ID 162\n", + "INFO: Created token '' at ID 163\n", + "INFO: Created token '' at ID 164\n", + "INFO: Created token '' at ID 165\n", + "INFO: Created token '' at ID 166\n", + "INFO: Created token '' at ID 167\n", + "INFO: Created token '' at ID 168\n", + "INFO: Created token '' at ID 169\n", + "INFO: Created token '' at ID 170\n", + "INFO: Created token '' at ID 171\n", + "INFO: Created token '' at ID 172\n", + "INFO: Created token '' at ID 173\n", + "INFO: Created token '' at ID 
174\n", + "INFO: Created token '' at ID 175\n", + "INFO: Created token '' at ID 176\n", + "INFO: Created token '' at ID 177\n", + "INFO: Created token '' at ID 178\n", + "INFO: Created token '' at ID 179\n", + "INFO: Created token '' at ID 180\n", + "INFO: Created token '' at ID 181\n", + "INFO: Created token '' at ID 182\n", + "INFO: Created token '' at ID 183\n", + "INFO: Created token '' at ID 184\n", + "INFO: Created token '' at ID 185\n", + "INFO: Created token '' at ID 186\n", + "INFO: Created token '' at ID 187\n", + "INFO: Created token '' at ID 188\n", + "INFO: Created token '' at ID 189\n", + "INFO: Created token '' at ID 190\n", + "INFO: Created token '' at ID 191\n", + "INFO: Created token '' at ID 192\n", + "INFO: Created token '' at ID 193\n", + "INFO: Created token '' at ID 194\n", + "INFO: Created token '' at ID 195\n", + "INFO: Created token '' at ID 196\n", + "INFO: Created token '' at ID 197\n", + "INFO: Created token '' at ID 198\n", + "INFO: Created token '' at ID 199\n", + "INFO: Created token '' at ID 200\n", + "INFO: Created token '' at ID 201\n", + "INFO: Created token '' at ID 202\n", + "INFO: Created token '' at ID 203\n", + "INFO: Created token '' at ID 204\n", + "INFO: Created token '' at ID 205\n", + "INFO: Created token '' at ID 206\n", + "INFO: Created token '' at ID 207\n", + "INFO: Created token '' at ID 208\n", + "INFO: Created token '' at ID 209\n", + "INFO: Created token '' at ID 210\n", + "INFO: Created token '' at ID 211\n", + "INFO: Created token '' at ID 212\n", + "INFO: Created token '' at ID 213\n", + "INFO: Created token '' at ID 214\n", + "INFO: Created token '' at ID 215\n", + "INFO: Created token '' at ID 216\n", + "INFO: Created token '' at ID 217\n", + "INFO: Created token '' at ID 218\n", + "INFO: Created token '' at ID 219\n", + "INFO: Created token '' at ID 220\n", + "INFO: Created token '' at ID 221\n", + "INFO: Created token '' at ID 222\n", + "INFO: Created token '' at ID 223\n", + "INFO: Created token '' at ID 224\n", + "INFO: Created token '' at ID 225\n", + "INFO: Created token '' at ID 226\n", + "INFO: Created token '' at ID 227\n", + "INFO: Created token '' at ID 228\n", + "INFO: Created token '' at ID 229\n", + "INFO: Created token '' at ID 230\n", + "INFO: Created token '' at ID 231\n", + "INFO: Created token '' at ID 232\n", + "INFO: Created token '' at ID 233\n", + "INFO: Created token '' at ID 234\n", + "INFO: Created token '' at ID 235\n", + "INFO: Created token '' at ID 236\n", + "INFO: Created token '' at ID 237\n", + "INFO: Created token '' at ID 238\n", + "INFO: Created token '' at ID 239\n", + "INFO: Created token '' at ID 240\n", + "INFO: Created token '' at ID 241\n", + "INFO: Created token '' at ID 242\n", + "INFO: Created token '' at ID 243\n", + "INFO: Created token '' at ID 244\n", + "INFO: Created token '' at ID 245\n", + "INFO: Created token '' at ID 246\n", + "INFO: Created token '' at ID 247\n", + "INFO: Created token '' at ID 248\n", + "INFO: Created token '' at ID 249\n", + "INFO: Created token '' at ID 250\n", + "INFO: Created token '' at ID 251\n", + "INFO: Created token '' at ID 252\n", + "INFO: Created token '' at ID 253\n", + "INFO: Created token '' at ID 254\n", + "INFO: Created token '' at ID 255\n", + "INFO: Created token '' at ID 256\n", + "INFO: Created token '' at ID 257\n", + "INFO: Created token '' at ID 258\n", + "INFO: Created token '' at ID 259\n", + "INFO: Created token '' at ID 260\n", + "INFO: Created token '' at ID 261\n", + "INFO: Created token '' at ID 262\n", + "INFO: Created token '' 
at ID 263\n", + "INFO: Created token '' at ID 264\n", + "INFO: Created token '' at ID 265\n", + "INFO: Created token '' at ID 266\n", + "INFO: Created token '' at ID 267\n", + "INFO: Created token '' at ID 268\n", + "INFO: Created token '' at ID 269\n", + "INFO: Created token '' at ID 270\n", + "INFO: Created token '' at ID 271\n", + "INFO: Created token '' at ID 272\n", + "INFO: Created token '' at ID 273\n", + "INFO: Created token '' at ID 274\n", + "INFO: Created token '' at ID 275\n", + "INFO: Created token '' at ID 276\n", + "INFO: Created token '' at ID 277\n", + "INFO: Created token '' at ID 278\n", + "INFO: Created token '' at ID 279\n", + "INFO: Created token '' at ID 280\n", + "INFO: Created token '' at ID 281\n", + "INFO: Created token '' at ID 282\n", + "INFO: Created token '' at ID 283\n", + "INFO: Created token '' at ID 284\n", + "INFO: Created token '' at ID 285\n", + "INFO: Created token '' at ID 286\n", + "INFO: Created token '' at ID 287\n", + "INFO: Created token '' at ID 288\n", + "INFO: Created token '' at ID 289\n", + "INFO: Created token '' at ID 290\n", + "INFO: Created token '' at ID 291\n", + "INFO: Created token '' at ID 292\n", + "INFO: Created token '' at ID 293\n", + "INFO: Created token '' at ID 294\n", + "INFO: Created token '' at ID 295\n", + "INFO: Created token '' at ID 296\n", + "INFO: Created token '' at ID 297\n", + "INFO: Created token '' at ID 298\n", + "INFO: Created token '' at ID 299\n", + "INFO: Created token '' at ID 300\n", + "INFO: Created token '' at ID 301\n", + "INFO: Created token '' at ID 302\n", + "INFO: Created token '' at ID 303\n", + "INFO: Created token '' at ID 304\n", + "INFO: Created token '' at ID 305\n", + "INFO: Created token '' at ID 306\n", + "INFO: Created token '' at ID 307\n", + "INFO: Created token '' at ID 308\n", + "INFO: Created token '' at ID 309\n", + "INFO: Created token '' at ID 310\n", + "INFO: Created token '' at ID 311\n", + "INFO: Created token '' at ID 312\n", + "INFO: Created token '' at ID 313\n", + "INFO: Created token '' at ID 314\n", + "INFO: Created token '' at ID 315\n", + "INFO: Created token '' at ID 316\n", + "INFO: Created token '' at ID 317\n", + "INFO: Created token '' at ID 318\n", + "INFO: Created token '' at ID 319\n", + "INFO: Created token '' at ID 320\n", + "INFO: Created token '' at ID 321\n", + "INFO: Created token '' at ID 322\n", + "INFO: Created token '' at ID 323\n", + "INFO: Created token '' at ID 324\n", + "INFO: Created token '' at ID 325\n", + "INFO: Created token '' at ID 326\n", + "INFO: Created token '' at ID 327\n", + "INFO: Created token '' at ID 328\n", + "INFO: Created token '' at ID 329\n", + "INFO: Created token '' at ID 330\n", + "INFO: Created token '' at ID 331\n", + "INFO: Created token '' at ID 332\n", + "INFO: Created token '' at ID 333\n", + "INFO: Created token '' at ID 334\n", + "INFO: Created token '' at ID 335\n", + "INFO: Created token '' at ID 336\n", + "INFO: Created token '' at ID 337\n", + "INFO: Created token '' at ID 338\n", + "INFO: Created token '' at ID 339\n", + "INFO: Created token '' at ID 340\n", + "INFO: Created token '' at ID 341\n", + "INFO: Created token '' at ID 342\n", + "INFO: Created token '' at ID 343\n", + "INFO: Created token '' at ID 344\n", + "INFO: Created token '' at ID 345\n", + "INFO: Created token '' at ID 346\n", + "INFO: Created token '' at ID 347\n", + "INFO: Created token '' at ID 348\n", + "INFO: Created token '' at ID 349\n", + "INFO: Created token '' at ID 350\n", + "INFO: Created token '' at ID 351\n", + "INFO: Created 
token '' at ID 352\n", + "INFO: Created token '' at ID 353\n", + "INFO: Created token '' at ID 354\n", + "INFO: Created token '+' at ID 355\n", + "INFO: New tokenizer vocab size: 356\n", + "INFO: Created new tokenizer at: /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer_new.model\n" ] } ], @@ -2041,29 +2453,58 @@ "# ! protoc --python_out=/workspace/nemo/NeMo-opensource/scripts/tokenizers/ sentencepiece_model.proto\n", "\n", "!python /workspace/nemo/NeMo-opensource/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \\\n", - "--input_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer.model \\\n", - "--output_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model\\\n", + "--input_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer.model \\\n", + "--output_file /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer_new.model\\\n", "--is_userdefined \\\n", - "--tokens \"\" \"\" \"\" \"\" \\\n", - " \"\" \"\" \"\" \"\"" + "--tokens $tokens_string" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "import sentencepiece as spm\n", "\n", "sp = spm.SentencePieceProcessor()\n", - "sp.load('/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/ja/tokenizer_spe_bpe_v1208/tokenizer_new.model')\n", + "sp.load('/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/tokenizer_new.model')\n", "\n", "vocab_list = [sp.id_to_piece(i) for i in range(sp.get_piece_size())]\n", "# Save the vocabulary to a file\n", - "# with open('new_tokenizer.vocab', 'w') as vocab_file:\n", - "# for token in vocab_list:\n", - "# vocab_file.write(token + '\\n')\n" + "with open('new_tokenizer.vocab', 'w') as vocab_file:\n", + " for token in vocab_list:\n", + " vocab_file.write(token + '\\n')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# Updating vocab.txt\n", + "vocab_file = '/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/vocab.txt'\n", + "new_vocab_file = '/workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/new_vocab.txt'\n", + "# read the existing vocab file and add , , ... 
tokens\n", + "with open(vocab_file, 'r') as f:\n", + " lines = f.readlines()\n", + "\n", + "# Add the new tokens to the vocab file\n", + "with open(new_vocab_file, 'w') as f:\n", + " for line in lines:\n", + " f.write(line)\n", + " for token in tokens:\n", + " f.write(token + '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "!cp new_tokenizer.vocab /workspace/nemo/NeMo-opensource/tutorials/asr/tokenizers/av_tokenizer/tokenizer_spe_bpe_v128/" ] }, { @@ -2079,7 +2520,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 61, "metadata": { "id": "8sAz2_RyMu7J" }, @@ -2088,7 +2529,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of tokens : 1208\n" + "Number of tokens : 128\n" ] } ], @@ -2103,11 +2544,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": { "id": "zktPYPCxNXNO" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The text in this dataset is too small to construct a tokenizer with vocab size = 228. Current number of tokens = 128. Please reconstruct the tokenizer with fewer tokens\n" + ] + } + ], "source": [ "if num_tokens < VOCAB_SIZE:\n", " print(\n", diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 94b3897b6256..629a45756d52 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -374,9 +374,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 16:47:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", - "[NeMo I 2024-07-08 16:47:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", - "[NeMo I 2024-07-08 16:47:54 common:815] Instantiating model from pre-trained checkpoint\n" + "[NeMo I 2024-07-22 20:42:32 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-07-22 20:42:32 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-07-22 20:42:32 common:815] Instantiating model from pre-trained checkpoint\n" ] } ], @@ -452,17 +452,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 16:47:54 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", - "[NeMo I 2024-07-08 16:47:54 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", - "[NeMo I 2024-07-08 16:47:54 common:815] Instantiating model from pre-trained checkpoint\n", - "[NeMo I 2024-07-08 16:47:55 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" + "[NeMo I 2024-07-22 20:42:32 
cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n", + "[NeMo I 2024-07-22 20:42:32 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo\n", + "[NeMo I 2024-07-22 20:42:32 common:815] Instantiating model from pre-trained checkpoint\n", + "[NeMo I 2024-07-22 20:42:33 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "[NeMo W 2024-07-08 16:47:55 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", + "[NeMo W 2024-07-22 20:42:33 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n", " Train config : \n", " manifest_filepath:\n", " - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json\n", @@ -504,7 +504,7 @@ " - 12\n", " - 8\n", " \n", - "[NeMo W 2024-07-08 16:47:55 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", + "[NeMo W 2024-07-22 20:42:33 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n", " Validation config : \n", " manifest_filepath:\n", " - /manifests/librispeech/librivox-dev-other.json\n", @@ -518,7 +518,7 @@ " pin_memory: true\n", " use_start_end_token: false\n", " \n", - "[NeMo W 2024-07-08 16:47:55 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", + "[NeMo W 2024-07-22 20:42:33 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n", " Test config : \n", " manifest_filepath:\n", " - /manifests/librispeech/librivox-dev-other.json\n", @@ -538,8 +538,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 16:47:55 features:305] PADDING: 0\n", - "[NeMo I 2024-07-08 16:47:57 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n" + "[NeMo I 2024-07-22 20:42:33 features:305] PADDING: 0\n", + "[NeMo I 2024-07-22 20:42:34 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.\n" ] } ], @@ -577,11 +577,11 @@ "text": [ "INFO: GPU available: True (cuda), used: True\n", "WARNING: Logging before flag parsing goes to stderr.\n", - "I0708 16:47:57.231223 129166743029568 rank_zero.py:64] GPU available: True (cuda), used: True\n", + "I0722 20:42:51.693691 129733185169216 rank_zero.py:64] GPU available: True (cuda), used: True\n", "INFO: TPU available: False, using: 0 TPU cores\n", - "I0708 
16:47:57.256626 129166743029568 rank_zero.py:64] TPU available: False, using: 0 TPU cores\n", + "I0722 20:42:51.715736 129733185169216 rank_zero.py:64] TPU available: False, using: 0 TPU cores\n", "INFO: HPU available: False, using: 0 HPUs\n", - "I0708 16:47:57.257805 129166743029568 rank_zero.py:64] HPU available: False, using: 0 HPUs\n" + "I0722 20:42:51.716401 129733185169216 rank_zero.py:64] HPU available: False, using: 0 HPUs\n" ] } ], @@ -848,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 47, "metadata": { "id": "F0GIxhyCJmFv" }, @@ -857,12 +857,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 948 files totalling 0.71 hours\n", - "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", - "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n", - "[NeMo I 2024-07-08 16:50:44 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", - "[NeMo I 2024-07-08 16:50:44 collections:201] 0 files were filtered totalling 0.00 hours\n" + "[NeMo I 2024-07-19 09:32:48 collections:199] Dataset loaded with 948 files totalling 0.71 hours\n", + "[NeMo I 2024-07-19 09:32:48 collections:201] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-19 09:32:48 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-19 09:32:48 collections:201] 0 files were filtered totalling 0.00 hours\n", + "[NeMo I 2024-07-19 09:32:48 collections:199] Dataset loaded with 130 files totalling 0.10 hours\n", + "[NeMo I 2024-07-19 09:32:48 collections:201] 0 files were filtered totalling 0.00 hours\n" ] } ], @@ -897,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -932,7 +932,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 49, "metadata": { "id": "T-XFuaA3OlOB" }, @@ -967,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 50, "metadata": { "id": "UDEIfMTcP6j6" }, @@ -981,10 +981,10 @@ "betas:\n", "- 0.9\n", "- 0.98\n", - "weight_decay: 0\n", + "weight_decay: 0.001\n", "sched:\n", " name: NoamAnnealing\n", - " d_model: 176\n", + " d_model: 512\n", " warmup_steps: 10000\n", " warmup_ratio: null\n", " min_lr: 1.0e-06\n", @@ -999,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 51, "metadata": { "id": "tp_8FGPcKjMd" }, @@ -1008,7 +1008,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 10:01:14 modelPT:767] Optimizer config = AdamW (\n", + "[NeMo I 2024-07-19 09:32:52 modelPT:767] Optimizer config = AdamW (\n", " Parameter Group 0\n", " amsgrad: False\n", " betas: [0.9, 0.98]\n", @@ -1021,10 +1021,10 @@ " maximize: False\n", " weight_decay: 0.0\n", " )\n", - "[NeMo I 2024-07-08 10:01:14 lr_scheduler:923] Scheduler \"\" \n", + "[NeMo I 2024-07-19 09:32:52 lr_scheduler:923] Scheduler \"\" \n", " will be used during training (effective maximum steps = 300) - \n", " Parameters : \n", - " (d_model: 176\n", + " (d_model: 512\n", " warmup_steps: 100\n", " warmup_ratio: null\n", " min_lr: 1.0e-06\n", @@ -1069,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 53, "metadata": { "id": "fRIDhU8RVBwi" }, @@ -1099,7 +1099,7 @@ }, { "cell_type": "code", - 
"execution_count": 42, + "execution_count": 54, "metadata": { "id": "iNnSp_azQ2u8" }, @@ -1108,10 +1108,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\n", "Module : ConformerEncoderAdapter\n", - "\n", "\n", + "\n", "\n", "\n", "Module : ConvASRDecoder\n", @@ -1162,18 +1162,20 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 55, "metadata": { "id": "oZZr6vSntuyX" }, "outputs": [], "source": [ - "from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig" + "from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig\n", + "from nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module import MultiHeadAttentionAdapterConfig\n", + "from nemo.collections.asr.parts.submodules.adapters.multi_head_attention_adapter_module import RelPositionMultiHeadAttentionAdapterConfig" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 58, "metadata": { "id": "dlj0Yud4MxOi" }, @@ -1182,14 +1184,14 @@ "#%% [code]\n", "#@title Adapter Setup { display-mode: \"form\" }\n", "adapter_name = \"AN4\" #@param {type:\"string\"}\n", - "adapter_dim = 32 #@param {type:\"integer\"}\n", + "adapter_dim = 64 #@param {type:\"integer\"}\n", "adapter_activation = \"swish\" #@param {type:\"string\"}\n", "adapter_norm_position = \"pre\" #@param [\"pre\", \"post\"]" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 59, "metadata": { "id": "Uv8WRQkXU3mu" }, @@ -1198,7 +1200,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "LinearAdapterConfig(in_features=176, dim=32, activation='swish', norm_position='pre', dropout=0.0, adapter_strategy=ResidualAddAdapterStrategyConfig(stochastic_depth=0.0, l2_lambda=0.0, _target_='nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy'), _target_='nemo.collections.common.parts.adapter_modules.LinearAdapter')\n" + "LinearAdapterConfig(in_features=512, dim=64, activation='swish', norm_position='pre', dropout=0.0, adapter_strategy=ResidualAddAdapterStrategyConfig(stochastic_depth=0.0, l2_lambda=0.0, _target_='nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy'), _target_='nemo.collections.common.parts.adapter_modules.LinearAdapter')\n" ] } ], @@ -1225,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 60, "metadata": { "id": "-MbSTbYiYtnB" }, @@ -1236,19 +1238,19 @@ " | Name | Type | Params | Mode \n", "--------------------------------------------------------------------------------\n", "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | train\n", - "1 | encoder | ConformerEncoderAdapter | 13.0 M | train\n", - "2 | decoder | ConvASRDecoder | 181 K | train\n", + "1 | encoder | ConformerEncoderAdapter | 121 M | train\n", + "2 | decoder | ConvASRDecoder | 66.2 K | train\n", "3 | loss | CTCLoss | 0 | train\n", "4 | spec_augmentation | SpectrogramAugmentation | 0 | train\n", "5 | wer | WER | 0 | train\n", "--------------------------------------------------------------------------------\n", - "13.2 M Trainable params\n", + "121 M Trainable params\n", "0 Non-trainable params\n", - "13.2 M Total params\n", - "52.616 Total estimated model params size (MB)" + "121 M Total params\n", + "486.005 Total estimated model params size (MB)" ] }, - "execution_count": 25, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1272,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 61, "metadata": { "id": 
"El6ewd1GX9V7" }, @@ -1294,7 +1296,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 62, "metadata": { "id": "rIvw0_8iYpHW" }, @@ -1305,19 +1307,19 @@ " | Name | Type | Params | Mode \n", "--------------------------------------------------------------------------------\n", "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | train\n", - "1 | encoder | ConformerEncoderAdapter | 13.2 M | train\n", - "2 | decoder | ConvASRDecoder | 181 K | train\n", + "1 | encoder | ConformerEncoderAdapter | 122 M | train\n", + "2 | decoder | ConvASRDecoder | 66.2 K | train\n", "3 | loss | CTCLoss | 0 | train\n", "4 | spec_augmentation | SpectrogramAugmentation | 0 | train\n", "5 | wer | WER | 0 | train\n", "--------------------------------------------------------------------------------\n", - "13.3 M Trainable params\n", + "122 M Trainable params\n", "0 Non-trainable params\n", - "13.3 M Total params\n", - "53.360 Total estimated model params size (MB)" + "122 M Total params\n", + "490.798 Total estimated model params size (MB)" ] }, - "execution_count": 27, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1343,7 +1345,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 63, "metadata": { "id": "ogUfDkjdZKHu" }, @@ -1352,8 +1354,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 10:05:36 adapter_mixins:719] Setting adapter 'AN4' status : Enabled = False\n", - "[NeMo I 2024-07-08 10:05:36 adapter_mixins:734] Setting adapter 'AN4' status : Enabled = True\n" + "[NeMo I 2024-07-19 09:39:28 adapter_mixins:719] Setting adapter 'AN4' status : Enabled = False\n", + "[NeMo I 2024-07-19 09:39:28 adapter_mixins:734] Setting adapter 'AN4' status : Enabled = True\n" ] } ], @@ -1379,7 +1381,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 64, "metadata": { "id": "RN2YayAoYzaI" }, @@ -1388,23 +1390,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.2.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module 
encoder.layers.8.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", - "[NeMo I 2024-07-08 10:05:38 adapter_mixins:435] Unfrozen adapter : AN4\n" + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.0.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.1.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.2.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.3.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.4.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.5.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.6.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.7.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.8.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.9.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.10.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 
adapter_mixins:405] Froze module encoder.layers.11.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.12.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.13.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.14.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.15.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.16.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:405] Froze module encoder.layers.17.conv.batch_norm: BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)\n", + "[NeMo I 2024-07-19 09:39:34 adapter_mixins:435] Unfrozen adapter : AN4\n" ] } ], @@ -1433,7 +1437,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 65, "metadata": { "id": "Lf3pdwQ2Zch5" }, @@ -1444,19 +1448,19 @@ " | Name | Type | Params | Mode\n", "-------------------------------------------------------------------------------\n", "0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | eval\n", - "1 | encoder | ConformerEncoderAdapter | 13.2 M | eval\n", - "2 | decoder | ConvASRDecoder | 181 K | eval\n", + "1 | encoder | ConformerEncoderAdapter | 122 M | eval\n", + "2 | decoder | ConvASRDecoder | 66.2 K | eval\n", "3 | loss | CTCLoss | 0 | eval\n", "4 | spec_augmentation | SpectrogramAugmentation | 0 | eval\n", "5 | wer | WER | 0 | eval\n", "-------------------------------------------------------------------------------\n", - "185 K Trainable params\n", - "13.2 M Non-trainable params\n", - "13.3 M Total params\n", - "53.360 Total estimated model params size (MB)" + "1.2 M Trainable params\n", + "121 M Non-trainable params\n", + "122 M Total params\n", + "490.798 Total estimated model params size (MB)" ] }, - "execution_count": 30, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" }
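For reference, the adapter recipe that this notebook diff exercises reduces to a handful of NeMo calls. The following is a minimal sketch, not part of the patch itself: the checkpoint, the adapter name "AN4", and the bottleneck width 64 simply mirror the run logged above, and `in_features` is read from the model config rather than hard-coded (the logs show d_model = 512 for stt_en_conformer_ctc_large).

import nemo.collections.asr as nemo_asr
from nemo.collections.common.parts.adapter_modules import LinearAdapterConfig

# Load the same pretrained CTC model used throughout the notebook.
model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_ctc_large")

# Size the adapter bottleneck to the encoder width (512 per the logs above).
adapter_cfg = LinearAdapterConfig(
    in_features=model.cfg.encoder.d_model,
    dim=64,
    activation='swish',
    norm_position='pre',
)
model.add_adapter(name="AN4", cfg=adapter_cfg)

# Disable all adapters, re-enable only "AN4", then freeze the base network so
# that only the adapter parameters remain trainable (this is what produces the
# "Enabled = False/True", "Froze module ...", and "Unfrozen adapter : AN4"
# messages, and the ~1.2 M trainable vs ~121 M frozen parameter split above).
model.set_enabled_adapters(enabled=False)
model.set_enabled_adapters(name="AN4", enabled=True)
model.freeze()
model.unfreeze_enabled_adapters()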