Caffe2 - C++ API
A deep learning, cross platform ML framework
video_decoder.h
1 #ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
2 #define CAFFE2_VIDEO_VIDEO_DECODER_H_
3 
4 #include <caffe2/core/logging.h>
5 #include <stdio.h>
6 #include <memory>
7 #include <string>
8 #include <vector>
9 
10 extern "C" {
11 #include <libavformat/avformat.h>
12 #include <libavformat/avio.h>
13 }
14 
15 namespace caffe2 {
16 
// Size in bytes of the scratch buffer that VideoIOContext allocates with
// av_malloc() and hands to avio_alloc_context().
#define VIO_BUFFER_SZ 32768
// NOTE(review): presumably an upper bound on the number of frames decoded
// from one video; it is not referenced in this header -- confirm against the
// decoder implementation.
#define MAX_DECODING_FRAMES 10000
19 
// Special fps values that select a sampling behavior instead of a rate:
//    0 (SAMPLE_NO_FRAME):       disable fps sampling, no frame sampled at all
//   -1 (SAMPLE_ALL_FRAMES):     unlimited fps sampling, sample at the native
//                               video fps
//   -2 (SAMPLE_TIMESTAMP_ONLY): disable fps sampling, but get the frame at the
//                               specific timestamp
enum SpecialFps {
  SAMPLE_NO_FRAME = 0,
  SAMPLE_ALL_FRAMES = -1,
  SAMPLE_TIMESTAMP_ONLY = -2,
};
29 
// Three different types of resolution handling when decoding the video:
//   0 (USE_WIDTH_HEIGHT):         resize to width x height and ignore the
//                                 aspect ratio
//   1 (USE_MINIMAL_WIDTH_HEIGHT): resize to make the size at least
//                                 (width x height) and keep the aspect ratio
//   2 (ORIGINAL_RES):             use the original resolution of the video; if
//                                 it is smaller than crop_height x crop_width,
//                                 resize so that new height >= crop_height and
//                                 new width >= crop_width, keeping the aspect
//                                 ratio
enum VideoResType {
  USE_WIDTH_HEIGHT = 0,
  USE_MINIMAL_WIDTH_HEIGHT = 1,
  ORIGINAL_RES = 2,
};
42 
// Three different types of decoding behavior are supported:
//   0 (DO_TMP_JITTER):  do temporal jittering to sample a random clip from the
//                       video
//   1 (DO_UNIFORM_SMP): uniformly sample multiple clips from the video
//   2 (USE_START_FRM):  sample a clip from a given starting frame
enum DecodeType {
  DO_TMP_JITTER = 0,
  DO_UNIFORM_SMP = 1,
  USE_START_FRM = 2,
};
52 
53 // sampling interval for fps starting at specified timestamp
54 // use enum SpecialFps to set special fps decoding behavior
55 // note sampled fps will not always accurately follow the target fps,
56 // because sampled frame has to snap to actual frame timestamp,
57 // e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
58 // video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
59 // because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
61  double timestamp;
62  double fps;
63  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
64  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
65  bool operator<(const SampleInterval& itvl) const {
66  return (timestamp < itvl.timestamp);
67  }
68 };
69 
70 class Params {
71  public:
72  // return all key-frames regardless of specified fps
73  bool keyFrames_ = false;
74 
75  // Output image pixel format
76  AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
77 
78  // Index of stream to decode.
79  // -1 will automatically decode the first video stream.
80  int streamIndex_ = -1;
81 
82  // How many frames to output at most from the video
83  // -1 no limit
84  int maximumOutputFrames_ = -1;
85 
86  // params for video resolution
87  int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
88  int crop_height_ = -1;
89  int crop_width_ = -1;
90  int height_min_ = -1;
91  int width_min_ = -1;
92  int scale_w_ = -1;
93  int scale_h_ = -1;
94 
95  // params for decoding behavior
96  int decode_type_ = DecodeType::DO_TMP_JITTER;
97  int num_of_required_frame_ = -1;
98 
99  // intervals_ control variable sampling fps between different timestamps
100  // intervals_ must be ordered strictly ascending by timestamps
101  // the first interval must have a timestamp of zero
102  // fps must be either the 3 special fps defined in SpecialFps, or > 0
103  std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
104 
105  Params() {}
106 
112  Params& fps(float v) {
113  intervals_.clear();
114  intervals_.emplace_back(0, v);
115  return *this;
116  }
117 
121  Params& pixelFormat(AVPixelFormat pixelFormat) {
122  pixelFormat_ = pixelFormat;
123  return *this;
124  }
125 
129  Params& keyFrames(bool keyFrames) {
130  keyFrames_ = keyFrames;
131  return *this;
132  }
133 
137  Params& streamIndex(int index) {
138  streamIndex_ = index;
139  return *this;
140  }
141 
145  Params& maxOutputFrames(int count) {
146  maximumOutputFrames_ = count;
147  return *this;
148  }
149 
153  Params& outputWidth(int width) {
154  scale_w_ = width;
155  return *this;
156  }
157 
161  Params& outputHeight(int height) {
162  scale_h_ = height;
163  return *this;
164  }
165 };
166 
167 // data structure for storing decoded video frames
169  public:
170  struct avDeleter {
171  void operator()(unsigned char* p) const {
172  av_free(p);
173  }
174  };
175  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
176 
177  // decoded data buffer
178  AvDataPtr data_;
179 
180  // size in bytes
181  int size_ = 0;
182 
183  // frame dimensions
184  int width_ = 0;
185  int height_ = 0;
186 
187  // timestamp in seconds since beginning of video
188  double timestamp_ = 0;
189 
190  // true if this is a key frame.
191  bool keyFrame_ = false;
192 
193  // index of frame in video
194  int index_ = -1;
195 
196  // Sequential number of outputted frame
197  int outputFrameIndex_ = -1;
198 };
199 
201  public:
202  explicit VideoIOContext(const std::string& fname)
203  : workBuffersize_(VIO_BUFFER_SZ),
204  workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
205  inputFile_(nullptr),
206  inputBuffer_(nullptr),
207  inputBufferSize_(0) {
208  inputFile_ = fopen(fname.c_str(), "rb");
209  if (inputFile_ == nullptr) {
210  LOG(ERROR) << "Error opening video file " << fname;
211  }
212  ctx_ = avio_alloc_context(
213  static_cast<unsigned char*>(workBuffer_.get()),
214  workBuffersize_,
215  0,
216  this,
217  &VideoIOContext::readFile,
218  nullptr, // no write function
219  &VideoIOContext::seekFile);
220  }
221 
222  explicit VideoIOContext(const char* buffer, int size)
223  : workBuffersize_(VIO_BUFFER_SZ),
224  workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
225  inputFile_(nullptr),
226  inputBuffer_(buffer),
227  inputBufferSize_(size) {
228  ctx_ = avio_alloc_context(
229  static_cast<unsigned char*>(workBuffer_.get()),
230  workBuffersize_,
231  0,
232  this,
233  &VideoIOContext::readMemory,
234  nullptr, // no write function
235  &VideoIOContext::seekMemory);
236  }
237 
238  ~VideoIOContext() {
239  av_free(ctx_);
240  if (inputFile_) {
241  fclose(inputFile_);
242  }
243  }
244 
245  int read(unsigned char* buf, int buf_size) {
246  if (inputBuffer_) {
247  return readMemory(this, buf, buf_size);
248  } else if (inputFile_) {
249  return readFile(this, buf, buf_size);
250  } else {
251  return -1;
252  }
253  }
254 
255  int64_t seek(int64_t offset, int whence) {
256  if (inputBuffer_) {
257  return seekMemory(this, offset, whence);
258  } else if (inputFile_) {
259  return seekFile(this, offset, whence);
260  } else {
261  return -1;
262  }
263  }
264 
265  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
266  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
267  if (feof(h->inputFile_)) {
268  return AVERROR_EOF;
269  }
270  size_t ret = fread(buf, 1, buf_size, h->inputFile_);
271  if (ret < buf_size) {
272  if (ferror(h->inputFile_)) {
273  return -1;
274  }
275  }
276  return ret;
277  }
278 
279  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
280  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
281  switch (whence) {
282  case SEEK_CUR: // from current position
283  case SEEK_END: // from eof
284  case SEEK_SET: // from beginning of file
285  return fseek(h->inputFile_, static_cast<long>(offset), whence);
286  break;
287  case AVSEEK_SIZE:
288  int64_t cur = ftell(h->inputFile_);
289  fseek(h->inputFile_, 0L, SEEK_END);
290  int64_t size = ftell(h->inputFile_);
291  fseek(h->inputFile_, cur, SEEK_SET);
292  return size;
293  }
294 
295  return -1;
296  }
297 
298  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
299  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
300  if (buf_size < 0) {
301  return -1;
302  }
303 
304  int reminder = h->inputBufferSize_ - h->offset_;
305  int r = buf_size < reminder ? buf_size : reminder;
306  if (r < 0) {
307  return AVERROR_EOF;
308  }
309 
310  memcpy(buf, h->inputBuffer_ + h->offset_, r);
311  h->offset_ += r;
312  return r;
313  }
314 
315  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
316  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
317  switch (whence) {
318  case SEEK_CUR: // from current position
319  h->offset_ += offset;
320  break;
321  case SEEK_END: // from eof
322  h->offset_ = h->inputBufferSize_ + offset;
323  break;
324  case SEEK_SET: // from beginning of file
325  h->offset_ = offset;
326  break;
327  case AVSEEK_SIZE:
328  return h->inputBufferSize_;
329  }
330  return h->offset_;
331  }
332 
333  AVIOContext* get_avio() {
334  return ctx_;
335  }
336 
337  private:
338  int workBuffersize_;
339  DecodedFrame::AvDataPtr workBuffer_;
340  // for file mode
341  FILE* inputFile_;
342 
343  // for memory mode
344  const char* inputBuffer_;
345  int inputBufferSize_;
346  int offset_ = 0;
347 
348  AVIOContext* ctx_;
349 };
350 
351 struct VideoMeta {
352  double fps;
353  int width;
354  int height;
355  enum AVMediaType codec_type;
356  AVPixelFormat pixFormat;
357  VideoMeta()
358  : fps(-1),
359  width(-1),
360  height(-1),
361  codec_type(AVMEDIA_TYPE_VIDEO),
362  pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
363 };
364 
366  public:
367  VideoDecoder();
368 
369  void decodeFile(
370  const std::string& filename,
371  const Params& params,
372  const int start_frm,
373  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
374 
375  void decodeMemory(
376  const char* buffer,
377  const int size,
378  const Params& params,
379  const int start_frm,
380  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
381 
382  private:
383  std::string ffmpegErrorStr(int result);
384 
385  void ResizeAndKeepAspectRatio(
386  const int origHeight,
387  const int origWidth,
388  const int heightMin,
389  const int widthMin,
390  int& outHeight,
391  int& outWidth);
392 
393  void decodeLoop(
394  const std::string& videoName,
395  VideoIOContext& ioctx,
396  const Params& params,
397  const int start_frm,
398  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
399 };
400 } // namespace caffe2
401 
402 #endif // CAFFE2_VIDEO_VIDEO_DECODER_H_
Params & outputHeight(int height)
Output frame height, default to video height.
Params & keyFrames(bool keyFrames)
Return all key-frames.
Params & outputWidth(int width)
Output frame width, default to video width.
Params & streamIndex(int index)
Index of video stream to process, defaults to the first video stream.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Params & fps(float v)
FPS of output frames. Setting it here will reset intervals_ and force decoding at the target FPS. This can be ...
Params & pixelFormat(AVPixelFormat pixelFormat)
Pixel format of output buffer, default PIX_FMT_RGB24.
Params & maxOutputFrames(int count)
Only output this many frames, default to no limit.