-
- Notifications
You must be signed in to change notification settings - Fork 204
feat: add audio narration (updated) #346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
351d87b f19a84a e143767 6f07b93 d3ef09a 9e86193 87a814f ce84a1b 5c584b2 802c8a2 aca8cdc 42b1007 9f4c280 20d29e1 109ffe0 8d27b4f d631b2d ab0805e e30538b 9469043 47bf845 e9f2d36 9293b0b a66acbc 888d335 e1a3a18 d7c54f2 3eaa3a8 05834c4 a6e45bd f23df51 f6cdbc0 873cf6d File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| """Add audio info. | ||
| | ||
| Revision ID: c176288cb508 | ||
| Revises: 8713b142f5de | ||
| Create Date: 2023-08-31 00:25:04.889325 | ||
| | ||
| """ | ||
| import sqlalchemy as sa | ||
| | ||
| from alembic import op | ||
| from openadapt.models import ForceFloat | ||
| | ||
| # revision identifiers, used by Alembic. | ||
| revision = "c176288cb508" | ||
| down_revision = "8713b142f5de" | ||
| branch_labels = None | ||
| depends_on = None | ||
| | ||
| | ||
def upgrade() -> None:
    """Create the ``audio_info`` table for storing narration audio.

    Auto-generated by Alembic and adjusted by hand; reversed by
    :func:`downgrade`.
    """
    columns = [
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("flac_data", sa.LargeBinary(), nullable=True),
        sa.Column("transcribed_text", sa.String(), nullable=True),
        sa.Column(
            "recording_timestamp",
            ForceFloat(precision=10, scale=2, asdecimal=False),
            nullable=True,
        ),
        sa.Column("sample_rate", sa.Integer(), nullable=True),
        sa.Column("words_with_timestamps", sa.Text(), nullable=True),
    ]
    constraints = [
        sa.ForeignKeyConstraint(
            ["recording_timestamp"],
            ["recording.timestamp"],
            name=op.f("fk_audio_info_recording_timestamp_recording"),
        ),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_audio_info")),
    ]
    op.create_table("audio_info", *columns, *constraints)
| | ||
| | ||
def downgrade() -> None:
    """Drop the ``audio_info`` table, reversing :func:`upgrade`."""
    op.drop_table("audio_info")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -4,11 +4,16 @@ | |
| | ||
| $ python openadapt/record.py "<description of task to be recorded>" | ||
| | ||
| To record audio: | ||
| | ||
| $ python openadapt/record.py "<description of task to be recorded>" --enable_audio | ||
| | ||
| """ | ||
| | ||
| from collections import namedtuple | ||
| from functools import partial, wraps | ||
| from typing import Any, Callable, Union | ||
| import io | ||
| import multiprocessing | ||
| import os | ||
| import queue | ||
| | @@ -24,7 +29,11 @@ | |
| from tqdm import tqdm | ||
| import fire | ||
| import mss.tools | ||
| import numpy as np | ||
| import psutil | ||
| import sounddevice | ||
| import soundfile | ||
| import whisper | ||
| | ||
| from openadapt import config, utils, window | ||
| from openadapt.db import crud | ||
| | @@ -804,15 +813,101 @@ def read_mouse_events( | |
| mouse_listener.stop() | ||
| | ||
| | ||
def record_audio(
    terminate_event: multiprocessing.Event,
    recording_timestamp: float,
) -> None:
    """Record audio narration during the recording and store data in database.

    Captures microphone input until ``terminate_event`` is set, transcribes
    it with OpenAI's Whisper, FLAC-compresses the raw audio, and persists
    audio bytes, transcript, sample rate, and per-word timestamps via
    ``crud.insert_audio_info``.

    Args:
        terminate_event: The event to signal termination of event reading.
        recording_timestamp: The timestamp of the recording.
    """
    utils.configure_logging(logger, LOG_LEVEL)
    utils.set_start_time(recording_timestamp)

    audio_frames = []  # to store audio frames

    def audio_callback(
        indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
    ) -> None:
        """Callback function used when new audio frames are recorded.

        Note: time is of type cffi.FFI.CData, but since we don't use this argument
        and we also don't use the cffi library, the Any type annotation is used.
        """
        # called whenever there are new audio frames
        audio_frames.append(indata.copy())

    # open InputStream and start recording while ActionEvents are recorded
    audio_stream = sounddevice.InputStream(
        callback=audio_callback, samplerate=16000, channels=1
    )
    logger.info("Audio recording started.")
    audio_stream.start()
    try:
        terminate_event.wait()
    finally:
        # release the audio device even if the wait is interrupted
        audio_stream.stop()
        audio_stream.close()

    if not audio_frames:
        # Nothing was captured (e.g. recording terminated immediately);
        # np.concatenate would raise ValueError on an empty list.
        logger.warning("No audio frames captured; skipping transcription.")
        return

    # Concatenate into one Numpy array
    concatenated_audio = np.concatenate(audio_frames, axis=0)
    # convert concatenated_audio to format expected by whisper
    converted_audio = concatenated_audio.flatten().astype(np.float32)

    # Convert audio to text using OpenAI's Whisper
    logger.info("Transcribing audio...")
    model = whisper.load_model("base")
    result_info = model.transcribe(converted_audio, word_timestamps=True, fp16=False)
    logger.info(f"The narrated text is: {result_info['text']}")

    # Collect word-level timestamps from ALL segments. (Previously only
    # segments[0] was read, which silently dropped the words of any
    # narration Whisper split into more than one segment.)
    # word_list stays empty if the user didn't say anything.
    word_list = []
    for segment in result_info["segments"]:
        # the 'words' key is absent when a segment contains no speech
        word_list.extend(segment.get("words", []))

    # compress and convert to bytes to save to database
    logger.info(
        "Size of uncompressed audio data: {} bytes".format(converted_audio.nbytes)
    )
    # Create an in-memory file-like object
    file_obj = io.BytesIO()
    # Write the audio data using lossless compression
    soundfile.write(
        file_obj, converted_audio, int(audio_stream.samplerate), format="FLAC"
    )
    # Get the compressed audio data as bytes
    compressed_audio_bytes = file_obj.getvalue()

    logger.info(
        "Size of compressed audio data: {} bytes".format(len(compressed_audio_bytes))
    )

    file_obj.close()

    # To decompress the audio and restore it to its original form:
    # restored_audio, restored_samplerate = sf.read(
    #     io.BytesIO(compressed_audio_bytes))

    # Create AudioInfo entry
    crud.insert_audio_info(
        compressed_audio_bytes,
        result_info["text"],
        recording_timestamp,
        int(audio_stream.samplerate),
        word_list,
    )
| | ||
| | ||
| @logger.catch | ||
| @trace(logger) | ||
| def record( | ||
| task_description: str, | ||
| ) -> None: | ||
| def record(task_description: str, enable_audio: bool = False) -> None: | ||
| """Record Screenshots/ActionEvents/WindowEvents. | ||
| | ||
| Args: | ||
| task_description: A text description of the task to be recorded. | ||
| enable_audio: a flag to enable or disable audio recording (default: False) | ||
| """ | ||
| logger.info(f"{task_description=}") | ||
| | ||
| | @@ -943,6 +1038,13 @@ def record( | |
| ) | ||
| mem_plotter.start() | ||
| | ||
| if enable_audio: | ||
| audio_recorder = threading.Thread( | ||
| target=record_audio, | ||
| args=(terminate_event, recording_timestamp), | ||
| ) | ||
| audio_recorder.start() | ||
| | ||
| # TODO: discard events until everything is ready | ||
| | ||
| collect_stats() | ||
| | @@ -972,6 +1074,9 @@ def record( | |
| screen_event_writer.join() | ||
| action_event_writer.join() | ||
| window_event_writer.join() | ||
| if enable_audio: | ||
| audio_recorder.join() | ||
| | ||
| terminate_perf_event.set() | ||
| | ||
| if PLOT_PERFORMANCE: | ||
| | ||
Uh oh!
There was an error while loading. Please reload this page.