From 52ac65c27edc554912e601c312435cc299bc8cfa Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 08:37:37 +0800 Subject: [PATCH 1/9] add function of audio slicing --- basic_pitch/constants.py | 1 + basic_pitch/inference.py | 119 ++++++++++++++++++++++++++++++++------- 2 files changed, 101 insertions(+), 19 deletions(-) diff --git a/basic_pitch/constants.py b/basic_pitch/constants.py index a78a487b..8479b21a 100644 --- a/basic_pitch/constants.py +++ b/basic_pitch/constants.py @@ -27,6 +27,7 @@ ANNOTATIONS_BASE_FREQUENCY = 27.5 # lowest key on a piano ANNOTATIONS_N_SEMITONES = 88 # number of piano keys AUDIO_SAMPLE_RATE = 22050 +AUDIO_SLICE_TIME = 20 # second of every audio slice AUDIO_N_CHANNELS = 1 N_FREQ_BINS_NOTES = ANNOTATIONS_N_SEMITONES * NOTES_BINS_PER_SEMITONE N_FREQ_BINS_CONTOURS = ANNOTATIONS_N_SEMITONES * CONTOURS_BINS_PER_SEMITONE diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 1393ff37..73928e08 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -31,6 +31,7 @@ AUDIO_SAMPLE_RATE, AUDIO_N_SAMPLES, ANNOTATIONS_FPS, + AUDIO_SLICE_TIME, FFT_HOP, ) from basic_pitch import ICASSP_2022_MODEL_PATH, note_creation as infer @@ -70,8 +71,74 @@ def window_audio_file(audio_original: Tensor, hop_size: int) -> Tuple[Tensor, Li return audio_windowed, window_times +def split_unwrapped_data( + unwrapped_list: list +) -> dict: + """ + split sliced unwrapped data and return completed unwrapped data + + Returns: + resDict: dict + unwrapped_data. + + """ + resDict = unwrapped_list[0] + + if len(unwrapped_list) < 2: + return resDict + + for k in resDict.keys(): + for i in range(1, len(unwrapped_list)): + tempDict = unwrapped_list[i] + resDict[k] = np.append(resDict[k], tempDict[k], axis = 0) + + return resDict + + +def slice_audio( + audio_original: np.array +) -> list: + """ + cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE and return slice list + + Returns: + resList: list + audio slice list. + + """ + resList = [] + original_length = audio_original.shape[0] + + sliceLen = AUDIO_SAMPLE_RATE * AUDIO_SLICE_TIME + partNums = int(np.ceil(original_length / sliceLen)) + + for i in range(partNums): + sliceEnd = sliceLen * i + sliceLen + if sliceEnd > original_length: + sliceEnd = original_length + tempSlice = audio_original[sliceLen * i : sliceEnd] + resList.append(tempSlice) + + return resList + + +def read_audio( + audio_path: Union[pathlib.Path, str] +) -> np.array: + """ + Read wave file (as mono) and return audioArray + + Returns: + audio_original: np.array + original audio array. + + """ + audio_original, _ = librosa.load(str(audio_path), sr=AUDIO_SAMPLE_RATE, mono=True) + return audio_original + + def get_audio_input( - audio_path: Union[pathlib.Path, str], overlap_len: int, hop_size: int + audio_original: np.array, overlap_len: int, hop_size: int ) -> Tuple[Tensor, List[Dict[str, int]], int]: """ Read wave file (as mono), pad appropriately, and return as @@ -87,7 +154,7 @@ def get_audio_input( """ assert overlap_len % 2 == 0, "overlap_length must be even, got {}".format(overlap_len) - audio_original, _ = librosa.load(str(audio_path), sr=AUDIO_SAMPLE_RATE, mono=True) + # audio_original, _ = librosa.load(str(audio_path), sr=AUDIO_SAMPLE_RATE, mono=True) original_length = audio_original.shape[0] audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original]) @@ -139,23 +206,37 @@ def run_inference( overlap_len = n_overlapping_frames * FFT_HOP hop_size = AUDIO_N_SAMPLES - overlap_len - audio_windowed, _, audio_original_length = get_audio_input(audio_path, overlap_len, hop_size) - - output = model(audio_windowed) - unwrapped_output = {k: unwrap_output(output[k], audio_original_length, n_overlapping_frames) for k in output} - - if debug_file: - with open(debug_file, "w") as f: - json.dump( - { - "audio_windowed": audio_windowed.numpy().tolist(), - "audio_original_length": audio_original_length, - "hop_size_samples": hop_size, - "overlap_length_samples": overlap_len, - "unwrapped_output": {k: v.tolist() for k, v in unwrapped_output.items()}, - }, - f, - ) + # slice audio + audio_original = read_audio(audio_path) + audio_slice_list = slice_audio(audio_original) + + unwrapped_list = [] + + for i in range(len(audio_slice_list)): + audio_original_slice = audio_slice_list[i] + + audio_windowed_slice, _, audio_original_length_slice = get_audio_input(audio_original_slice, overlap_len, hop_size) + + output = model(audio_windowed_slice) + + unwrapped_output_slice = {k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output} + unwrapped_list.append(unwrapped_output_slice) + + if debug_file: + with open(debug_file, "a") as f: + json.dump( + { + "slice_ID": i, + "audio_windowed_slice": audio_windowed_slice.numpy().tolist(), + "audio_original_length_slice": audio_original_length_slice, + "hop_size_samples": hop_size, + "overlap_length_samples": overlap_len, + "unwrapped_output": {k: v.tolist() for k, v in unwrapped_output.items()}, + }, + f, + ) + # merge all sliced unwrapped output + unwrapped_output = split_unwrapped_data(unwrapped_list) return unwrapped_output From a2b5266de2200abe23b70f9c4d58c1180fd8cfa1 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 10:03:38 +0800 Subject: [PATCH 2/9] tox --- basic_pitch/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 73928e08..4b3fcf01 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -72,8 +72,8 @@ def window_audio_file(audio_original: Tensor, hop_size: int) -> Tuple[Tensor, Li def split_unwrapped_data( - unwrapped_list: list -) -> dict: + unwrapped_list: List[Dict[str, np.array]] +) -> Dict[str, np.array]: """ split sliced unwrapped data and return completed unwrapped data @@ -97,7 +97,7 @@ def split_unwrapped_data( def slice_audio( audio_original: np.array -) -> list: +) -> List[np.array]: """ cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE and return slice list @@ -231,7 +231,7 @@ def run_inference( "audio_original_length_slice": audio_original_length_slice, "hop_size_samples": hop_size, "overlap_length_samples": overlap_len, - "unwrapped_output": {k: v.tolist() for k, v in unwrapped_output.items()}, + "unwrapped_output_slice": {k: v.tolist() for k, v in unwrapped_output_slice.items()}, }, f, ) From 4a715e61e690575d82de8c2f2c3b80d8312126bd Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 11:36:52 +0800 Subject: [PATCH 3/9] pass flake8 --- basic_pitch/inference.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 4b3fcf01..18dcaef6 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -86,12 +86,12 @@ def split_unwrapped_data( if len(unwrapped_list) < 2: return resDict - + for k in resDict.keys(): for i in range(1, len(unwrapped_list)): tempDict = unwrapped_list[i] - resDict[k] = np.append(resDict[k], tempDict[k], axis = 0) - + resDict[k] = np.append(resDict[k], tempDict[k], axis=0) + return resDict @@ -215,11 +215,12 @@ def run_inference( for i in range(len(audio_slice_list)): audio_original_slice = audio_slice_list[i] - audio_windowed_slice, _, audio_original_length_slice = get_audio_input(audio_original_slice, overlap_len, hop_size) - + audio_windowed_slice, _, audio_original_length_slice = \ + get_audio_input(audio_original_slice, overlap_len, hop_size) output = model(audio_windowed_slice) - unwrapped_output_slice = {k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output} + unwrapped_output_slice = \ + {k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output} unwrapped_list.append(unwrapped_output_slice) if debug_file: @@ -234,7 +235,7 @@ def run_inference( "unwrapped_output_slice": {k: v.tolist() for k, v in unwrapped_output_slice.items()}, }, f, - ) + ) # merge all sliced unwrapped output unwrapped_output = split_unwrapped_data(unwrapped_list) From 2a9c5855b37e890de20c5062d72403c9c065d9f1 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 11:53:50 +0800 Subject: [PATCH 4/9] pass test --- basic_pitch/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 18dcaef6..ab5760c4 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -214,7 +214,7 @@ def run_inference( for i in range(len(audio_slice_list)): audio_original_slice = audio_slice_list[i] - + audio_windowed_slice, _, audio_original_length_slice = \ get_audio_input(audio_original_slice, overlap_len, hop_size) output = model(audio_windowed_slice) From f0ccd8618d969d400de5440f04ea9f1447d3a7e1 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 12:05:20 +0800 Subject: [PATCH 5/9] fix --- basic_pitch/inference.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index ab5760c4..18a917b6 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -215,12 +215,13 @@ def run_inference( for i in range(len(audio_slice_list)): audio_original_slice = audio_slice_list[i] - audio_windowed_slice, _, audio_original_length_slice = \ + audio_windowed_slice, _, audio_original_length_slice = ( get_audio_input(audio_original_slice, overlap_len, hop_size) + ) output = model(audio_windowed_slice) - - unwrapped_output_slice = \ + unwrapped_output_slice = ( {k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output} + ) unwrapped_list.append(unwrapped_output_slice) if debug_file: From d8850180f482d74463e8667742b394bff2c8ab0b Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 12:20:57 +0800 Subject: [PATCH 6/9] go --- basic_pitch/inference.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 18a917b6..94707232 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -71,9 +71,7 @@ def window_audio_file(audio_original: Tensor, hop_size: int) -> Tuple[Tensor, Li return audio_windowed, window_times -def split_unwrapped_data( - unwrapped_list: List[Dict[str, np.array]] -) -> Dict[str, np.array]: +def split_unwrapped_data(unwrapped_list: List[Dict[str, np.array]]) -> Dict[str, np.array]: """ split sliced unwrapped data and return completed unwrapped data @@ -95,9 +93,7 @@ def split_unwrapped_data( return resDict -def slice_audio( - audio_original: np.array -) -> List[np.array]: +def slice_audio(audio_original: np.array) -> List[np.array]: """ cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE and return slice list @@ -122,9 +118,7 @@ def slice_audio( return resList -def read_audio( - audio_path: Union[pathlib.Path, str] -) -> np.array: +def read_audio(audio_path: Union[pathlib.Path, str]) -> np.array: """ Read wave file (as mono) and return audioArray @@ -215,13 +209,13 @@ def run_inference( for i in range(len(audio_slice_list)): audio_original_slice = audio_slice_list[i] - audio_windowed_slice, _, audio_original_length_slice = ( - get_audio_input(audio_original_slice, overlap_len, hop_size) + audio_windowed_slice, _, audio_original_length_slice = get_audio_input( + audio_original_slice, overlap_len, hop_size ) output = model(audio_windowed_slice) - unwrapped_output_slice = ( - {k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output} - ) + unwrapped_output_slice = { + k: unwrap_output(output[k], audio_original_length_slice, n_overlapping_frames) for k in output + } unwrapped_list.append(unwrapped_output_slice) if debug_file: From f618eca543d0751685381523053963b5c7ecd1e1 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 12:40:53 +0800 Subject: [PATCH 7/9] go --- basic_pitch/inference.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index 94707232..aa2d1f60 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -73,11 +73,10 @@ def window_audio_file(audio_original: Tensor, hop_size: int) -> Tuple[Tensor, Li def split_unwrapped_data(unwrapped_list: List[Dict[str, np.array]]) -> Dict[str, np.array]: """ - split sliced unwrapped data and return completed unwrapped data + Merge the split model inference results and return the complete result. Returns: - resDict: dict - unwrapped_data. + A dictionary with the notes, onsets and contours. """ resDict = unwrapped_list[0] @@ -95,10 +94,11 @@ def split_unwrapped_data(unwrapped_list: List[Dict[str, np.array]]) -> Dict[str, def slice_audio(audio_original: np.array) -> List[np.array]: """ - cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE and return slice list + Cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE + and return slice list Returns: - resList: list + resList: list of slice audio slice list. """ @@ -120,11 +120,11 @@ def slice_audio(audio_original: np.array) -> List[np.array]: def read_audio(audio_path: Union[pathlib.Path, str]) -> np.array: """ - Read wave file (as mono) and return audioArray + Read wave file (as mono) and return audio signal Returns: audio_original: np.array - original audio array. + original audio signal. """ audio_original, _ = librosa.load(str(audio_path), sr=AUDIO_SAMPLE_RATE, mono=True) @@ -135,7 +135,7 @@ def get_audio_input( audio_original: np.array, overlap_len: int, hop_size: int ) -> Tuple[Tensor, List[Dict[str, int]], int]: """ - Read wave file (as mono), pad appropriately, and return as + padding appropriately of audio signal, and return as windowed signal, with window length = AUDIO_N_SAMPLES Returns: @@ -231,7 +231,7 @@ def run_inference( }, f, ) - # merge all sliced unwrapped output + # merge all unwrapped_output = split_unwrapped_data(unwrapped_list) return unwrapped_output From fc160ff590da8bc525a4996aa5dc7259444d4423 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 12:44:18 +0800 Subject: [PATCH 8/9] add AUDIO_SLICE_TIME --- basic_pitch/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/basic_pitch/constants.py b/basic_pitch/constants.py index 8479b21a..aede4ae1 100644 --- a/basic_pitch/constants.py +++ b/basic_pitch/constants.py @@ -27,7 +27,7 @@ ANNOTATIONS_BASE_FREQUENCY = 27.5 # lowest key on a piano ANNOTATIONS_N_SEMITONES = 88 # number of piano keys AUDIO_SAMPLE_RATE = 22050 -AUDIO_SLICE_TIME = 20 # second of every audio slice +AUDIO_SLICE_TIME = 20 # seconds of every audio slice AUDIO_N_CHANNELS = 1 N_FREQ_BINS_NOTES = ANNOTATIONS_N_SEMITONES * NOTES_BINS_PER_SEMITONE N_FREQ_BINS_CONTOURS = ANNOTATIONS_N_SEMITONES * CONTOURS_BINS_PER_SEMITONE From 47d5d669e12a6d47f73883a3ea791ae5de932c06 Mon Sep 17 00:00:00 2001 From: Oto_G <421739728@qq.com> Date: Sun, 26 Jun 2022 12:49:23 +0800 Subject: [PATCH 9/9] pass test --- basic_pitch/inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py index aa2d1f60..3a807752 100644 --- a/basic_pitch/inference.py +++ b/basic_pitch/inference.py @@ -94,8 +94,7 @@ def split_unwrapped_data(unwrapped_list: List[Dict[str, np.array]]) -> Dict[str, def slice_audio(audio_original: np.array) -> List[np.array]: """ - Cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE - and return slice list + Cut audio Array by AUDIO_SLICE_TIME (default 5 sec) * AUDIO_SAMPLE_RATE and return slice list Returns: resList: list of slice