112 lines
4.3 KiB
Python
112 lines
4.3 KiB
Python
|
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# =============================================================================
|
||
|
"""Encoding and decoding audio using FFmpeg."""
|
||
|
|
||
|
from __future__ import absolute_import
|
||
|
from __future__ import division
|
||
|
from __future__ import print_function
|
||
|
|
||
|
from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
|
||
|
from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
|
||
|
from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
|
||
|
from tensorflow.contrib.util import loader
|
||
|
from tensorflow.python.framework import ops
|
||
|
from tensorflow.python.platform import resource_loader
|
||
|
|
||
|
_ffmpeg_so = loader.load_op_library(
|
||
|
resource_loader.get_path_to_datafile('ffmpeg.so'))
|
||
|
|
||
|
|
||
|
def decode_audio(contents, file_format=None, samples_per_second=None,
|
||
|
channel_count=None, stream=None):
|
||
|
"""Create an op that decodes the contents of an audio file.
|
||
|
|
||
|
Note that ffmpeg is free to select the "best" audio track from an mp4.
|
||
|
https://trac.ffmpeg.org/wiki/Map
|
||
|
|
||
|
Args:
|
||
|
contents: The binary contents of the audio file to decode. This is a
|
||
|
scalar.
|
||
|
file_format: A string or scalar string tensor specifying which
|
||
|
format the contents will conform to. This can be mp3, mp4, ogg,
|
||
|
or wav.
|
||
|
samples_per_second: The number of samples per second that is
|
||
|
assumed, as an `int` or scalar `int32` tensor. In some cases,
|
||
|
resampling will occur to generate the correct sample rate.
|
||
|
channel_count: The number of channels that should be created from the
|
||
|
audio contents, as an `int` or scalar `int32` tensor. If the
|
||
|
`contents` have more than this number, then some channels will
|
||
|
be merged or dropped. If `contents` has fewer than this, then
|
||
|
additional channels will be created from the existing ones.
|
||
|
stream: A string specifying which stream from the content file
|
||
|
should be decoded, e.g., '0' means the 0-th stream.
|
||
|
The default value is '' which leaves the decision to ffmpeg.
|
||
|
|
||
|
Returns:
|
||
|
A rank-2 tensor that has time along dimension 0 and channels along
|
||
|
dimension 1. Dimension 0 will be `samples_per_second *
|
||
|
length_in_seconds` wide, and dimension 1 will be `channel_count`
|
||
|
wide. If ffmpeg fails to decode the audio then an empty tensor will
|
||
|
be returned.
|
||
|
"""
|
||
|
return gen_decode_audio_op_py.decode_audio_v2(
|
||
|
contents, file_format=file_format, samples_per_second=samples_per_second,
|
||
|
channel_count=channel_count, stream=stream)
|
||
|
|
||
|
|
||
|
ops.NotDifferentiable('DecodeAudio')
|
||
|
|
||
|
|
||
|
def encode_audio(audio, file_format=None, samples_per_second=None):
|
||
|
"""Creates an op that encodes an audio file using sampled audio from a tensor.
|
||
|
|
||
|
Args:
|
||
|
audio: A rank-2 `Tensor` that has time along dimension 0 and
|
||
|
channels along dimension 1. Dimension 0 is `samples_per_second *
|
||
|
length_in_seconds` long.
|
||
|
file_format: The type of file to encode, as a string or rank-0
|
||
|
string tensor. "wav" is the only supported format.
|
||
|
samples_per_second: The number of samples in the audio tensor per
|
||
|
second of audio, as an `int` or rank-0 `int32` tensor.
|
||
|
|
||
|
Returns:
|
||
|
A scalar tensor that contains the encoded audio in the specified file
|
||
|
format.
|
||
|
"""
|
||
|
return gen_encode_audio_op_py.encode_audio_v2(
|
||
|
audio,
|
||
|
file_format=file_format,
|
||
|
samples_per_second=samples_per_second,
|
||
|
bits_per_second=192000) # not used by WAV
|
||
|
|
||
|
|
||
|
ops.NotDifferentiable('EncodeAudio')
|
||
|
|
||
|
|
||
|
def decode_video(contents):
|
||
|
"""Create an op that decodes the contents of a video file.
|
||
|
|
||
|
Args:
|
||
|
contents: The binary contents of the video file to decode. This is a
|
||
|
scalar.
|
||
|
|
||
|
Returns:
|
||
|
A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
|
||
|
"""
|
||
|
return gen_decode_video_op_py.decode_video(contents)
|
||
|
|
||
|
|
||
|
ops.NotDifferentiable('DecodeVideo')
|