Skip to content

Commit

Permalink
feat: language settings (#509)
Browse files Browse the repository at this point in the history
  • Loading branch information
EzraEllette authored Oct 16, 2024
1 parent fc28418 commit 0645c84
Show file tree
Hide file tree
Showing 44 changed files with 1,226 additions and 347 deletions.
570 changes: 329 additions & 241 deletions screenpipe-app-tauri/components/recording-settings.tsx

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions screenpipe-app-tauri/lib/hooks/use-settings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { localDataDir, join } from "@tauri-apps/api/path";
import { platform } from "@tauri-apps/plugin-os";
import { Pipe } from "./use-pipes";
import posthog from "posthog-js";
import {Language} from "@/lib/language";

export type VadSensitivity = "low" | "medium" | "high";
export type EmbeddedLLMConfig = {
Expand Down Expand Up @@ -40,6 +41,7 @@ export interface Settings {
audioChunkDuration: number; // new field
useChineseMirror: boolean; // Add this line
embeddedLLM: EmbeddedLLMConfig;
languages: Language[],
}

const defaultSettings: Settings = {
Expand Down Expand Up @@ -75,6 +77,7 @@ const defaultSettings: Settings = {
analyticsEnabled: true,
audioChunkDuration: 30, // default to 30 seconds
useChineseMirror: false, // Add this line
languages: [],
embeddedLLM: {
enabled: false,
model: "llama3.2:1b-instruct-q4_K_M",
Expand Down Expand Up @@ -179,6 +182,10 @@ export function useSettings() {
model: "llama3.2:1b-instruct-q4_K_M",
port: 11438,
};

const savedLanguages =
(await store!.get<Language[]>("languages")) || [];

const currentPlatform = await platform();
const defaultIgnoredWindows =
currentPlatform === "macos" // TODO: windows and linux
Expand Down Expand Up @@ -233,6 +240,7 @@ export function useSettings() {
audioChunkDuration: savedAudioChunkDuration,
useChineseMirror: savedUseChineseMirror,
embeddedLLM: savedEmbeddedLLM,
languages: savedLanguages,
});
} catch (error) {
console.error("failed to load settings:", error);
Expand Down
77 changes: 77 additions & 0 deletions screenpipe-app-tauri/lib/language.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
 * Languages a user can select in the settings.
 *
 * Each member's value is the lowercase English name of the language; that
 * string is what gets stored in the `languages` setting.
 *
 * NOTE(review): the Tauri sidecar appears to forward these strings verbatim as
 * `--language` arguments, so they presumably must match the names accepted by
 * the CLI's `Language` value enum — confirm before adding or renaming entries.
 */
export enum Language {
English = "english",
Chinese = "chinese",
German = "german",
Spanish = "spanish",
Russian = "russian",
Korean = "korean",
French = "french",
Japanese = "japanese",
Portuguese = "portuguese",
Turkish = "turkish",
Polish = "polish",
Catalan = "catalan",
Dutch = "dutch",
Arabic = "arabic",
Swedish = "swedish",
Italian = "italian",
Indonesian = "indonesian",
Hindi = "hindi",
Finnish = "finnish",
Hebrew = "hebrew",
Ukrainian = "ukrainian",
Greek = "greek",
Malay = "malay",
Czech = "czech",
Romanian = "romanian",
Danish = "danish",
Hungarian = "hungarian",
Norwegian = "norwegian",
Thai = "thai",
Urdu = "urdu",
Croatian = "croatian",
Bulgarian = "bulgarian",
Lithuanian = "lithuanian",
Latin = "latin",
Malayalam = "malayalam",
Welsh = "welsh",
Slovak = "slovak",
Persian = "persian",
Latvian = "latvian",
Bengali = "bengali",
Serbian = "serbian",
Azerbaijani = "azerbaijani",
Slovenian = "slovenian",
Estonian = "estonian",
Macedonian = "macedonian",
Nepali = "nepali",
Mongolian = "mongolian",
Bosnian = "bosnian",
Kazakh = "kazakh",
Albanian = "albanian",
Swahili = "swahili",
Galician = "galician",
Marathi = "marathi",
Punjabi = "punjabi",
Sinhala = "sinhala",
Khmer = "khmer",
Afrikaans = "afrikaans",
Belarusian = "belarusian",
Gujarati = "gujarati",
Amharic = "amharic",
Yiddish = "yiddish",
Lao = "lao",
Uzbek = "uzbek",
Faroese = "faroese",
Pashto = "pashto",
Maltese = "maltese",
Sanskrit = "sanskrit",
Luxembourgish = "luxembourgish",
Myanmar = "myanmar",
Tibetan = "tibetan",
Tagalog = "tagalog",
Assamese = "assamese",
Tatar = "tatar",
Hausa = "hausa",
Javanese = "javanese"
}
12 changes: 12 additions & 0 deletions screenpipe-app-tauri/src-tauri/src/sidecar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,11 @@ fn spawn_sidecar(app: &tauri::AppHandle) -> Result<CommandChild, String> {
.and_then(|v| v.as_bool())
.unwrap_or(false);

let languages = store
.get("languages")
.and_then(|v| v.as_array().cloned())
.unwrap_or_default();

println!("audio_chunk_duration: {}", audio_chunk_duration);

let port_str = port.to_string();
Expand Down Expand Up @@ -197,6 +202,13 @@ fn spawn_sidecar(app: &tauri::AppHandle) -> Result<CommandChild, String> {
}
}

if !languages.is_empty() && languages[0] != Value::String("default".to_string()) {
for language in &languages {
args.push("--language");
args.push(language.as_str().unwrap());
}
}

if deepgram_api_key != "default" {
args.push("--deepgram-api-key");
let key = deepgram_api_key.as_str();
Expand Down
9 changes: 6 additions & 3 deletions screenpipe-audio/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,12 @@ regex = "1.11.0"


[target.'cfg(target_os = "windows")'.dependencies]
ort = { version = "2.0.0-rc.5", features = ["download-binaries", "copy-dylibs", "directml", "cuda"] }
ort = { version = "2.0.0-rc.5", features = [
"download-binaries",
"copy-dylibs",
"directml",
"cuda",
] }
esaxx-rs = "0.1.10"
samplerate = { version = "0.2.4" }
libsamplerate-sys = "0.1.10"
Expand Down Expand Up @@ -108,5 +113,3 @@ harness = false
[[bench]]
name = "record_and_transcribe_benchmark"
harness = false


Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ async fn setup_test() -> (
None,
&output_path,
VadSensitivity::High,
vec![],
)
.await
.unwrap();
Expand Down
42 changes: 20 additions & 22 deletions screenpipe-audio/src/audio_processing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,24 @@ pub fn spectral_subtraction(audio: &[f32], d: f32) -> Result<Vec<f32>> {
let mut indata = padded_audio;
r2c.process(&mut indata, &mut y)?;

let mut processed_audio = y.iter().map(|&x|{


let magnitude_y = x.abs().powf(2.0);

let div = 1.0 - (d / magnitude_y) as f32;

let mut processed_audio = y
.iter()
.map(|&x| {
let magnitude_y = x.abs().powf(2.0);

let gain = {
if div > 0.0 {
f32::sqrt(div)
} else {
0.0f32
}
};
let div = 1.0 - (d / magnitude_y);

x * gain
let gain = {
if div > 0.0 {
f32::sqrt(div)
} else {
0.0f32
}
};

}).collect::<Vec<Complex32>>();
x * gain
})
.collect::<Vec<Complex32>>();

let c2r = real_planner.plan_fft_inverse(window_size);

Expand All @@ -67,18 +66,17 @@ pub fn spectral_subtraction(audio: &[f32], d: f32) -> Result<Vec<f32>> {
Ok(outdata)
}


/// Estimates the noise power of `audio` as the mean of its squared samples.
///
/// This is not an average over non-speech segments only: how much pause time a
/// recording contains is unknown, so for now the noise is assumed to be
/// constant across the whole buffer (which somewhat defeats the purpose of a
/// noise estimate).
///
/// Returns `0.0` for an empty slice rather than the `NaN` that `0.0 / 0.0`
/// would produce.
pub fn average_noise_spectrum(audio: &[f32]) -> f32 {
    if audio.is_empty() {
        return 0.0;
    }

    // Each sample contributes its squared magnitude exactly once.
    let total_power: f32 = audio.iter().map(|sample| sample * sample).sum();

    total_power / audio.len() as f32
}
9 changes: 8 additions & 1 deletion screenpipe-audio/src/bin/screenpipe-audio-forever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use screenpipe_audio::vad_engine::VadSensitivity;
use screenpipe_audio::AudioDevice;
use screenpipe_audio::AudioTranscriptionEngine;
use screenpipe_audio::VadEngineEnum;
use screenpipe_core::Language;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
Expand All @@ -34,11 +35,14 @@ struct Args {

#[clap(long, help = "Deepgram API key")]
deepgram_api_key: Option<String>,

#[clap(short = 'l', long, value_enum)]
language: Vec<Language>,
}

fn print_devices(devices: &[AudioDevice]) {
println!("Available audio devices:");
for (_, device) in devices.iter().enumerate() {
for device in devices.iter() {
println!(" {}", device);
}

Expand All @@ -60,6 +64,8 @@ async fn main() -> Result<()> {

let args = Args::parse();

let languages = args.language;

let devices = list_audio_devices().await?;

if args.list_audio_devices {
Expand Down Expand Up @@ -87,6 +93,7 @@ async fn main() -> Result<()> {
args.deepgram_api_key,
&PathBuf::from("output.mp4"),
VadSensitivity::Medium,
languages,
)
.await?;
// Spawn threads for each device
Expand Down
15 changes: 11 additions & 4 deletions screenpipe-audio/src/bin/screenpipe-audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use screenpipe_audio::vad_engine::VadSensitivity;
use screenpipe_audio::AudioDevice;
use screenpipe_audio::AudioTranscriptionEngine;
use screenpipe_audio::VadEngineEnum;
use screenpipe_core::Language;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
Expand All @@ -29,13 +30,16 @@ struct Args {
#[clap(long, help = "List available audio devices")]
list_audio_devices: bool,

#[clap(short = 'l', long, value_enum)]
language: Vec<Language>,

#[clap(long, help = "Deepgram API key")]
deepgram_api_key: Option<String>,
}

fn print_devices(devices: &[AudioDevice]) {
println!("Available audio devices:");
for (_, device) in devices.iter().enumerate() {
for device in devices.iter() {
println!(" {}", device);
}

Expand Down Expand Up @@ -64,6 +68,8 @@ async fn main() -> Result<()> {
return Ok(());
}

let languages = args.language;

let deepgram_api_key = args.deepgram_api_key;

let devices = if args.audio_device.is_empty() {
Expand Down Expand Up @@ -91,13 +97,13 @@ async fn main() -> Result<()> {
deepgram_api_key,
&output_path,
VadSensitivity::Medium,
languages,
)
.await?;
// Spawn threads for each device
let recording_threads: Vec<_> = devices
.into_iter()
.enumerate()
.map(|(_, device)| {
.map(|device| {
let device = Arc::new(device);
let whisper_sender = whisper_sender.clone();
let device_control = Arc::new(AtomicBool::new(true));
Expand All @@ -113,6 +119,7 @@ async fn main() -> Result<()> {
whisper_sender,
device_control_clone,
)
.await
})
})
.collect();
Expand All @@ -139,7 +146,7 @@ async fn main() -> Result<()> {

// Wait for all recording threads to finish
for (i, thread) in recording_threads.into_iter().enumerate() {
let file_path = thread.await.unwrap().await;
let file_path = thread.await.unwrap();
println!("Recording {} complete: {:?}", i, file_path);
}

Expand Down
2 changes: 2 additions & 0 deletions screenpipe-audio/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ pub mod pcm_decode;
pub mod stt;
pub mod vad_engine;
pub mod whisper;
mod tokenizer;

pub use core::{
default_input_device, default_output_device, list_audio_devices, parse_audio_device,
record_and_transcribe, AudioDevice, AudioTranscriptionEngine, DeviceControl, DeviceType,
Expand Down
Loading

0 comments on commit 0645c84

Please sign in to comment.