-
- Notifications
You must be signed in to change notification settings - Fork 204
feat: add audio narration (updated) #346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
351d87b f19a84a e143767 6f07b93 d3ef09a 9e86193 87a814f ce84a1b 5c584b2 802c8a2 aca8cdc 42b1007 9f4c280 20d29e1 109ffe0 8d27b4f d631b2d ab0805e e30538b 9469043 47bf845 e9f2d36 9293b0b a66acbc 888d335 e1a3a18 d7c54f2 3eaa3a8 05834c4 a6e45bd f23df51 f6cdbc0 873cf6d File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| """Add audio info. | ||
| | ||
| Revision ID: c176288cb508 | ||
| Revises: 8713b142f5de | ||
| Create Date: 2023-08-31 00:25:04.889325 | ||
| | ||
| """ | ||
| import sqlalchemy as sa | ||
| | ||
| from alembic import op | ||
| from openadapt.models import ForceFloat | ||
| | ||
| # revision identifiers, used by Alembic. | ||
| revision = "c176288cb508" | ||
| down_revision = "8713b142f5de" | ||
| branch_labels = None | ||
| depends_on = None | ||
| | ||
| | ||
def upgrade() -> None:
    """Create the ``audio_info`` table for storing narration audio.

    Auto-generated by Alembic and adjusted by hand; reversed by
    :func:`downgrade`.
    """
    columns = [
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("flac_data", sa.LargeBinary(), nullable=True),
        sa.Column("transcribed_text", sa.String(), nullable=True),
        sa.Column(
            "recording_timestamp",
            ForceFloat(precision=10, scale=2, asdecimal=False),
            nullable=True,
        ),
        sa.Column("sample_rate", sa.Integer(), nullable=True),
        sa.Column("words_with_timestamps", sa.Text(), nullable=True),
    ]
    constraints = [
        sa.ForeignKeyConstraint(
            ["recording_timestamp"],
            ["recording.timestamp"],
            name=op.f("fk_audio_info_recording_timestamp_recording"),
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audio_info")),
    ]
    op.create_table("audio_info", *columns, *constraints)
| | ||
| | ||
def downgrade() -> None:
    """Drop the ``audio_info`` table, reversing :func:`upgrade`."""
    op.drop_table("audio_info")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -4,11 +4,16 @@ | |
| | ||
| $ python openadapt/record.py "<description of task to be recorded>" | ||
| | ||
| To record audio: | ||
| | ||
| $ python openadapt/record.py "<description of task to be recorded>" --enable_audio | ||
| | ||
| """ | ||
| | ||
| from collections import namedtuple | ||
| from functools import partial, wraps | ||
| from typing import Any, Callable, Union | ||
| import io | ||
| import multiprocessing | ||
| import os | ||
| import queue | ||
| | @@ -24,7 +29,11 @@ | |
| from tqdm import tqdm | ||
| import fire | ||
| import mss.tools | ||
| import numpy as np | ||
| import psutil | ||
| import sounddevice | ||
| import soundfile | ||
| import whisper | ||
| | ||
| from openadapt import config, utils, window | ||
| from openadapt.db import crud | ||
| | @@ -804,15 +813,101 @@ def read_mouse_events( | |
| mouse_listener.stop() | ||
| | ||
| | ||
def record_audio(
    terminate_event: multiprocessing.Event,
    recording_timestamp: float,
) -> None:
    """Record audio narration during the recording and store data in database.

    Captures microphone input until ``terminate_event`` is set, transcribes
    it with OpenAI's Whisper, FLAC-compresses the raw audio, and persists
    audio bytes, transcript, sample rate, and per-word timestamps via
    ``crud.insert_audio_info``.

    Args:
        terminate_event: The event to signal termination of event reading.
        recording_timestamp: The timestamp of the recording.
    """
    utils.configure_logging(logger, LOG_LEVEL)
    utils.set_start_time(recording_timestamp)

    audio_frames = []  # to store audio frames

    def audio_callback(
        indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
    ) -> None:
        """Callback function used when new audio frames are recorded.

        Note: time is of type cffi.FFI.CData, but since we don't use this argument
        and we also don't use the cffi library, the Any type annotation is used.
        """
        # called whenever there are new audio frames
        audio_frames.append(indata.copy())

    # open InputStream and start recording while ActionEvents are recorded
    audio_stream = sounddevice.InputStream(
        callback=audio_callback, samplerate=16000, channels=1
    )
    logger.info("Audio recording started.")
    audio_stream.start()
    try:
        terminate_event.wait()
    finally:
        # release the audio device even if the wait is interrupted
        audio_stream.stop()
        audio_stream.close()

    if not audio_frames:
        # Nothing was captured (e.g. recording terminated immediately);
        # np.concatenate would raise ValueError on an empty list.
        logger.warning("No audio frames captured; skipping transcription.")
        return

    # Concatenate into one Numpy array
    concatenated_audio = np.concatenate(audio_frames, axis=0)
    # convert concatenated_audio to format expected by whisper
    converted_audio = concatenated_audio.flatten().astype(np.float32)

    # Convert audio to text using OpenAI's Whisper
    logger.info("Transcribing audio...")
    model = whisper.load_model("base")
    result_info = model.transcribe(converted_audio, word_timestamps=True, fp16=False)
    logger.info(f"The narrated text is: {result_info['text']}")

    # Collect word-level timestamps from ALL segments. (Previously only
    # segments[0] was read, which silently dropped the words of any
    # narration Whisper split into more than one segment.)
    # word_list stays empty if the user didn't say anything.
    word_list = []
    for segment in result_info["segments"]:
        # the 'words' key is absent when a segment contains no speech
        word_list.extend(segment.get("words", []))

    # compress and convert to bytes to save to database
    logger.info(
        "Size of uncompressed audio data: {} bytes".format(converted_audio.nbytes)
    )
    # Create an in-memory file-like object
    file_obj = io.BytesIO()
    # Write the audio data using lossless compression
    soundfile.write(
        file_obj, converted_audio, int(audio_stream.samplerate), format="FLAC"
    )
    # Get the compressed audio data as bytes
    compressed_audio_bytes = file_obj.getvalue()

    logger.info(
        "Size of compressed audio data: {} bytes".format(len(compressed_audio_bytes))
    )

    file_obj.close()

    # To decompress the audio and restore it to its original form:
    # restored_audio, restored_samplerate = sf.read(
    #     io.BytesIO(compressed_audio_bytes))

    # Create AudioInfo entry
    crud.insert_audio_info(
        compressed_audio_bytes,
        result_info["text"],
        recording_timestamp,
        int(audio_stream.samplerate),
        word_list,
    )
| | ||
| | ||
| @logger.catch | ||
| @trace(logger) | ||
| def record( | ||
| task_description: str, | ||
| ) -> None: | ||
| def record(task_description: str, enable_audio: bool = False) -> None: | ||
| """Record Screenshots/ActionEvents/WindowEvents. | ||
| | ||
| Args: | ||
| task_description: A text description of the task to be recorded. | ||
| enable_audio: a flag to enable or disable audio recording (default: False) | ||
| """ | ||
| logger.info(f"{task_description=}") | ||
| | ||
| | @@ -943,6 +1038,13 @@ def record( | |
| ) | ||
| mem_plotter.start() | ||
| | ||
| if enable_audio: | ||
| audio_recorder = threading.Thread( | ||
| target=record_audio, | ||
| args=(terminate_event, recording_timestamp), | ||
| ) | ||
| audio_recorder.start() | ||
| | ||
| # TODO: discard events until everything is ready | ||
| | ||
| collect_stats() | ||
| | @@ -972,6 +1074,9 @@ def record( | |
| screen_event_writer.join() | ||
| action_event_writer.join() | ||
| window_event_writer.join() | ||
| if enable_audio: | ||
| audio_recorder.join() | ||
| | ||
| terminate_perf_event.set() | ||
| | ||
| if PLOT_PERFORMANCE: | ||
| | ||
Uh oh!
There was an error while loading. Please reload this page.