// av_writer.cpp 14.5 KB
#include "av_writer.h"

#include <android/log.h>

#include <cstdint>
#include <cstring>

// Tag passed to __android_log_print for every message logged from this file.
#define LOG_TAG "AVWriter"
// printf-style logging helpers: LOGI logs at ANDROID_LOG_INFO priority,
// LOGE at ANDROID_LOG_ERROR priority, both under LOG_TAG.
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)
#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)

// Four-character codes of the RIFF/AVI container.
//
// FOURCC packs four byte-sized values into one 32-bit integer with the first
// argument in the least significant byte, so that storing the integer on a
// little-endian machine yields the characters in reading order.
#define FOURCC(a,b,c,d)                      \
    ( static_cast<uint32_t>(a)               \
    | (static_cast<uint32_t>(b) << 8)        \
    | (static_cast<uint32_t>(c) << 16)       \
    | (static_cast<uint32_t>(d) << 24))

namespace {

// Container / list identifiers.
constexpr uint32_t RIFF_FOURCC = FOURCC('R','I','F','F');
constexpr uint32_t AVI_FOURCC  = FOURCC('A','V','I',' ');
constexpr uint32_t LIST_FOURCC = FOURCC('L','I','S','T');
constexpr uint32_t HDRL_FOURCC = FOURCC('h','d','r','l');
constexpr uint32_t AVIH_FOURCC = FOURCC('a','v','i','h');
constexpr uint32_t STRL_FOURCC = FOURCC('s','t','r','l');
constexpr uint32_t STRH_FOURCC = FOURCC('s','t','r','h');
constexpr uint32_t STRF_FOURCC = FOURCC('s','t','r','f');
constexpr uint32_t MOVI_FOURCC = FOURCC('m','o','v','i');

// Stream type and handler identifiers.
constexpr uint32_t VIDS_FOURCC = FOURCC('v','i','d','s');
constexpr uint32_t AUDS_FOURCC = FOURCC('a','u','d','s');
constexpr uint32_t DIB_FOURCC  = FOURCC('D','I','B',' ');
constexpr uint32_t PCM_FOURCC  = FOURCC('P','C','M',' ');

// Data chunk identifiers: stream 0 video ("00dc") and stream 1 audio ("01wb").
constexpr uint32_t DC00_FOURCC = FOURCC('0','0','d','c');
constexpr uint32_t WB01_FOURCC = FOURCC('0','1','w','b');

}  // namespace

AVWriter::AVWriter()
    : is_open_(false)
    , width_(0)
    , height_(0)
    , fps_(30)
    , video_frame_count_(0)
    , audio_enabled_(false)
    , sample_rate_(44100)
    , channels_(1)
    , audio_sample_count_(0)
    , total_video_size_(0)
    , total_audio_size_(0)
{
}

// Finalizes and closes the output file if it is still open; close() is a
// no-op when the writer is already closed.
AVWriter::~AVWriter() {
    close();
}

/**
 * Opens a new AVI file and writes its header.
 *
 * Any file that is currently open is closed (and finalized) first. The output
 * path is derived from @p filename by replacing its extension with ".avi";
 * a name without an extension is used unchanged.
 *
 * @param filename    Requested output path.
 * @param width       Frame width in pixels, must be > 0.
 * @param height      Frame height in pixels, must be > 0.
 * @param fps         Frames per second, must be > 0.
 * @param enableAudio Whether a 16-bit PCM audio stream is added.
 * @param sampleRate  Audio sample rate in Hz, must be > 0 when audio is enabled.
 * @param channels    Audio channel count, must be > 0 when audio is enabled.
 * @return true when the file was created and the header written; false on
 *         invalid parameters or any I/O failure.
 */
bool AVWriter::open(const std::string& filename, int width, int height, int fps, 
                   bool enableAudio, int sampleRate, int channels) {
    if (is_open_) {
        close();
    }

    // The header writer divides by fps and derives buffer sizes from the
    // dimensions, so non-positive values must never get that far.
    if (width <= 0 || height <= 0 || fps <= 0) {
        LOGE("Invalid video parameters: %dx%d @ %dfps", width, height, fps);
        return false;
    }
    if (enableAudio && (sampleRate <= 0 || channels <= 0)) {
        LOGE("Invalid audio parameters: %d Hz, %d channels", sampleRate, channels);
        return false;
    }

    filename_ = filename;
    width_ = width;
    height_ = height;
    fps_ = fps;
    audio_enabled_ = enableAudio;
    sample_rate_ = sampleRate;
    channels_ = channels;
    video_frame_count_ = 0;
    audio_sample_count_ = 0;
    total_video_size_ = 0;
    total_audio_size_ = 0;

    // Change extension to .avi. Only a dot inside the last path component is
    // an extension separator; a dot in a directory name ("/a/my.app/video")
    // must not cause the path to be truncated.
    std::string avi_filename = filename;
    const size_t dot_pos = avi_filename.find_last_of('.');
    const size_t sep_pos = avi_filename.find_last_of('/');
    const bool dot_in_basename =
        dot_pos != std::string::npos &&
        (sep_pos == std::string::npos || dot_pos > sep_pos);
    if (dot_in_basename) {
        avi_filename = avi_filename.substr(0, dot_pos) + ".avi";
    }

    file_.open(avi_filename, std::ios::binary);
    if (!file_.is_open()) {
        LOGE("Failed to open file: %s", avi_filename.c_str());
        return false;
    }

    writeAVIHeader();
    if (file_.fail()) {
        // A truncated header makes every later chunk unreadable; give up now.
        LOGE("Failed to write AVI header: %s", avi_filename.c_str());
        file_.close();
        return false;
    }
    is_open_ = true;
    
    LOGI("Opened AV file: %s (%dx%d @ %dfps) Audio: %s", 
         avi_filename.c_str(), width, height, fps, 
         audio_enabled_ ? "ON" : "OFF");
    return true;
}

/**
 * Appends one video frame to the open file.
 *
 * @param frame Image to store; must not be empty.
 * @return true only when a video chunk was actually written and the stream is
 *         still healthy; false when the writer is closed, the frame is empty
 *         or rejected, or the write failed.
 */
bool AVWriter::writeVideoFrame(const cv::Mat& frame) {
    if (!is_open_ || frame.empty()) {
        return false;
    }

    // writeVideoFrameInternal() returns void and simply bails out on input it
    // cannot store. It advances total_video_size_ for every chunk it emits,
    // so an unchanged counter means nothing was written and the frame must
    // not be counted (the count ends up in the file header).
    const auto bytes_before = total_video_size_;
    writeVideoFrameInternal(frame);
    if (total_video_size_ == bytes_before) {
        return false;
    }

    if (file_.fail()) {
        LOGE("Failed to write video frame to file");
        return false;
    }

    video_frame_count_++;
    return true;
}

bool AVWriter::writeAudioData(const std::vector<short>& audioData) {
    if (!is_open_ || !audio_enabled_ || audioData.empty()) {
        return false;
    }

    writeAudioChunk(audioData);
    audio_sample_count_ += audioData.size();
    return true;
}

void AVWriter::writeAVIHeader() {
    // Calculate frame size
    int frame_size = width_ * height_ * 3; // RGB24
    int microsec_per_frame = 1000000 / fps_;
    
    // RIFF header
    file_.write("RIFF", 4);
    file_size_pos_ = file_.tellp();
    uint32_t file_size = 0; // Will be updated later
    file_.write(reinterpret_cast<const char*>(&file_size), 4);
    file_.write("AVI ", 4);
    
    // LIST hdrl
    file_.write("LIST", 4);
    uint32_t hdrl_size = audio_enabled_ ? 308 : 244; // Approximate size
    file_.write(reinterpret_cast<const char*>(&hdrl_size), 4);
    file_.write("hdrl", 4);
    
    // avih (main AVI header)
    file_.write("avih", 4);
    uint32_t avih_size = 56;
    file_.write(reinterpret_cast<const char*>(&avih_size), 4);
    
    uint32_t microsec_per_frame_val = microsec_per_frame;
    file_.write(reinterpret_cast<const char*>(&microsec_per_frame_val), 4);
    uint32_t max_bytes_per_sec = frame_size * fps_;
    file_.write(reinterpret_cast<const char*>(&max_bytes_per_sec), 4);
    uint32_t padding_granularity = 0;
    file_.write(reinterpret_cast<const char*>(&padding_granularity), 4);
    uint32_t flags = 0x10; // AVIF_HASINDEX
    file_.write(reinterpret_cast<const char*>(&flags), 4);
    
    video_frames_pos_ = file_.tellp();
    uint32_t total_frames = 0; // Will be updated later
    file_.write(reinterpret_cast<const char*>(&total_frames), 4);
    uint32_t initial_frames = 0;
    file_.write(reinterpret_cast<const char*>(&initial_frames), 4);
    uint32_t streams = audio_enabled_ ? 2 : 1;
    file_.write(reinterpret_cast<const char*>(&streams), 4);
    uint32_t suggested_buffer_size = frame_size;
    file_.write(reinterpret_cast<const char*>(&suggested_buffer_size), 4);
    uint32_t width = width_;
    file_.write(reinterpret_cast<const char*>(&width), 4);
    uint32_t height = height_;
    file_.write(reinterpret_cast<const char*>(&height), 4);
    uint32_t reserved[4] = {0, 0, 0, 0};
    file_.write(reinterpret_cast<const char*>(reserved), 16);
    
    // Video stream header
    file_.write("LIST", 4);
    uint32_t strl_size = 116;
    file_.write(reinterpret_cast<const char*>(&strl_size), 4);
    file_.write("strl", 4);
    
    // strh (stream header)
    file_.write("strh", 4);
    uint32_t strh_size = 56;
    file_.write(reinterpret_cast<const char*>(&strh_size), 4);
    file_.write("vids", 4); // fccType
    file_.write("DIB ", 4); // fccHandler
    uint32_t stream_flags = 0;
    file_.write(reinterpret_cast<const char*>(&stream_flags), 4);
    uint16_t priority = 0;
    file_.write(reinterpret_cast<const char*>(&priority), 2);
    uint16_t language = 0;
    file_.write(reinterpret_cast<const char*>(&language), 2);
    uint32_t initial_frames_stream = 0;
    file_.write(reinterpret_cast<const char*>(&initial_frames_stream), 4);
    uint32_t scale = 1;
    file_.write(reinterpret_cast<const char*>(&scale), 4);
    uint32_t rate = fps_;
    file_.write(reinterpret_cast<const char*>(&rate), 4);
    uint32_t start = 0;
    file_.write(reinterpret_cast<const char*>(&start), 4);
    video_length_pos_ = file_.tellp(); // Save position to update later
    uint32_t length = 0; // Will be updated later
    file_.write(reinterpret_cast<const char*>(&length), 4);
    uint32_t suggested_buffer_size_stream = frame_size;
    file_.write(reinterpret_cast<const char*>(&suggested_buffer_size_stream), 4);
    uint32_t quality = 0;
    file_.write(reinterpret_cast<const char*>(&quality), 4);
    uint32_t sample_size = 0;
    file_.write(reinterpret_cast<const char*>(&sample_size), 4);
    uint16_t left = 0, top = 0, right = width_, bottom = height_;
    file_.write(reinterpret_cast<const char*>(&left), 2);
    file_.write(reinterpret_cast<const char*>(&top), 2);
    file_.write(reinterpret_cast<const char*>(&right), 2);
    file_.write(reinterpret_cast<const char*>(&bottom), 2);
    
    // strf (stream format)
    file_.write("strf", 4);
    uint32_t strf_size = 40;
    file_.write(reinterpret_cast<const char*>(&strf_size), 4);
    
    // BITMAPINFOHEADER
    uint32_t bi_size = 40;
    file_.write(reinterpret_cast<const char*>(&bi_size), 4);
    int32_t bi_width = width_;
    file_.write(reinterpret_cast<const char*>(&bi_width), 4);
    int32_t bi_height = height_;
    file_.write(reinterpret_cast<const char*>(&bi_height), 4);
    uint16_t bi_planes = 1;
    file_.write(reinterpret_cast<const char*>(&bi_planes), 2);
    uint16_t bi_bit_count = 24;
    file_.write(reinterpret_cast<const char*>(&bi_bit_count), 2);
    uint32_t bi_compression = 0; // BI_RGB
    file_.write(reinterpret_cast<const char*>(&bi_compression), 4);
    uint32_t bi_size_image = frame_size;
    file_.write(reinterpret_cast<const char*>(&bi_size_image), 4);
    int32_t bi_x_pels_per_meter = 0;
    file_.write(reinterpret_cast<const char*>(&bi_x_pels_per_meter), 4);
    int32_t bi_y_pels_per_meter = 0;
    file_.write(reinterpret_cast<const char*>(&bi_y_pels_per_meter), 4);
    uint32_t bi_clr_used = 0;
    file_.write(reinterpret_cast<const char*>(&bi_clr_used), 4);
    uint32_t bi_clr_important = 0;
    file_.write(reinterpret_cast<const char*>(&bi_clr_important), 4);
    
    // Audio stream header (if enabled)
    if (audio_enabled_) {
        file_.write("LIST", 4);
        uint32_t audio_strl_size = 92;
        file_.write(reinterpret_cast<const char*>(&audio_strl_size), 4);
        file_.write("strl", 4);
        
        // Audio strh
        file_.write("strh", 4);
        uint32_t audio_strh_size = 56;
        file_.write(reinterpret_cast<const char*>(&audio_strh_size), 4);
        file_.write("auds", 4); // fccType
        uint32_t audio_handler = 0;
        file_.write(reinterpret_cast<const char*>(&audio_handler), 4);
        uint32_t audio_stream_flags = 0;
        file_.write(reinterpret_cast<const char*>(&audio_stream_flags), 4);
        uint16_t audio_priority = 0;
        file_.write(reinterpret_cast<const char*>(&audio_priority), 2);
        uint16_t audio_language = 0;
        file_.write(reinterpret_cast<const char*>(&audio_language), 2);
        uint32_t audio_initial_frames = 0;
        file_.write(reinterpret_cast<const char*>(&audio_initial_frames), 4);
        uint32_t audio_scale = 1;
        file_.write(reinterpret_cast<const char*>(&audio_scale), 4);
        uint32_t audio_rate = sample_rate_;
        file_.write(reinterpret_cast<const char*>(&audio_rate), 4);
        uint32_t audio_start = 0;
        file_.write(reinterpret_cast<const char*>(&audio_start), 4);
        
        audio_samples_pos_ = file_.tellp();
        uint32_t audio_length = 0; // Will be updated later
        file_.write(reinterpret_cast<const char*>(&audio_length), 4);
        uint32_t audio_suggested_buffer_size = sample_rate_ * channels_ * 2; // 1 second buffer
        file_.write(reinterpret_cast<const char*>(&audio_suggested_buffer_size), 4);
        uint32_t audio_quality = 0;
        file_.write(reinterpret_cast<const char*>(&audio_quality), 4);
        uint32_t audio_sample_size = channels_ * 2; // 16-bit samples
        file_.write(reinterpret_cast<const char*>(&audio_sample_size), 4);
        uint32_t audio_reserved[2] = {0, 0};
        file_.write(reinterpret_cast<const char*>(audio_reserved), 8);
        
        // Audio strf (WAVEFORMATEX)
        file_.write("strf", 4);
        uint32_t audio_strf_size = 16;
        file_.write(reinterpret_cast<const char*>(&audio_strf_size), 4);
        
        uint16_t format_tag = 1; // PCM
        file_.write(reinterpret_cast<const char*>(&format_tag), 2);
        uint16_t audio_channels = channels_;
        file_.write(reinterpret_cast<const char*>(&audio_channels), 2);
        uint32_t samples_per_sec = sample_rate_;
        file_.write(reinterpret_cast<const char*>(&samples_per_sec), 4);
        uint32_t avg_bytes_per_sec = sample_rate_ * channels_ * 2;
        file_.write(reinterpret_cast<const char*>(&avg_bytes_per_sec), 4);
        uint16_t block_align = channels_ * 2;
        file_.write(reinterpret_cast<const char*>(&block_align), 2);
        uint16_t bits_per_sample = 16;
        file_.write(reinterpret_cast<const char*>(&bits_per_sample), 2);
    }
    
    // LIST movi
    file_.write("LIST", 4);
    movi_size_pos_ = file_.tellp();
    uint32_t movi_size = 0; // Will be updated later
    file_.write(reinterpret_cast<const char*>(&movi_size), 4);
    file_.write("movi", 4);
    
    movi_list_pos_ = file_.tellp();
}

/**
 * Converts one frame to the on-disk layout and appends it as a "00dc" chunk.
 *
 * 3-channel input is stored as-is; 4-channel input is converted with
 * cv::COLOR_RGBA2BGR. The image is flipped vertically (rows are stored
 * bottom-up) and resized to width_ x height_ when its size differs.
 * Frames with another channel count or with non-8-bit channels are logged
 * and dropped without writing anything.
 *
 * On success total_video_size_ grows by the chunk payload plus its 8-byte
 * chunk header.
 *
 * @param frame Source image.
 */
void AVWriter::writeVideoFrameInternal(const cv::Mat& frame) {
    if (frame.empty()) return;

    // The stream format written by writeAVIHeader() declares 24 bits per
    // pixel, so only 8-bit channels can be stored without corrupting the file.
    if (frame.depth() != CV_8U) {
        LOGE("Unsupported frame depth (8-bit channels required)");
        return;
    }
    
    // Convert frame to BGR if needed and flip vertically (AVI requirement)
    cv::Mat bgr_frame;
    if (frame.channels() == 3) {
        cv::flip(frame, bgr_frame, 0); // Flip vertically
    } else if (frame.channels() == 4) {
        cv::Mat temp;
        cv::cvtColor(frame, temp, cv::COLOR_RGBA2BGR);
        cv::flip(temp, bgr_frame, 0);
    } else {
        LOGE("Unsupported frame format");
        return;
    }
    
    // Resize if necessary
    if (bgr_frame.cols != width_ || bgr_frame.rows != height_) {
        cv::resize(bgr_frame, bgr_frame, cv::Size(width_, height_));
    }
    
    // Uncompressed (BI_RGB) scanlines must start on 4-byte boundaries, so each
    // row is padded up to the next multiple of four bytes.
    const size_t row_bytes = static_cast<size_t>(bgr_frame.cols) * bgr_frame.elemSize();
    const size_t row_stride = (row_bytes + 3) & ~static_cast<size_t>(3);
    const size_t row_padding = row_stride - row_bytes;
    const uint32_t chunk_size =
        static_cast<uint32_t>(row_stride * static_cast<size_t>(bgr_frame.rows));

    // Write video chunk
    file_.write("00dc", 4); // Video chunk ID
    file_.write(reinterpret_cast<const char*>(&chunk_size), 4);
    if (row_padding == 0 && bgr_frame.isContinuous()) {
        // Common case: pixel data is already laid out exactly as required.
        file_.write(reinterpret_cast<const char*>(bgr_frame.data),
                    static_cast<std::streamsize>(chunk_size));
    } else {
        static const char kZeroPad[4] = {0, 0, 0, 0};
        for (int row = 0; row < bgr_frame.rows; ++row) {
            file_.write(reinterpret_cast<const char*>(bgr_frame.ptr(row)),
                        static_cast<std::streamsize>(row_bytes));
            file_.write(kZeroPad, static_cast<std::streamsize>(row_padding));
        }
    }
    
    // chunk_size is a multiple of four, so the chunk already ends on the even
    // boundary RIFF requires and no extra pad byte is needed.
    total_video_size_ += chunk_size + 8;
}

// Appends the given 16-bit samples to the file as a single "01wb" chunk
// (id, 32-bit byte count, raw sample bytes, optional pad byte to reach an
// even length) and adds the bytes written to total_audio_size_.
// Does nothing for an empty vector.
void AVWriter::writeAudioChunk(const std::vector<short>& audioData) {
    if (!audioData.empty()) {
        const uint32_t payload_bytes = audioData.size() * sizeof(short);
        const uint32_t pad_bytes = payload_bytes % 2;

        // Chunk header: id followed by payload length.
        file_.write("01wb", 4);
        file_.write(reinterpret_cast<const char*>(&payload_bytes), 4);

        // Payload.
        file_.write(reinterpret_cast<const char*>(audioData.data()), payload_bytes);

        // RIFF chunks start on even offsets.
        if (pad_bytes == 1) {
            const char zero = 0;
            file_.write(&zero, 1);
        }

        total_audio_size_ += payload_bytes + 8 + pad_bytes;
    }
}

void AVWriter::close() {
    if (!is_open_) {
        return;
    }
    
    finalize();
    file_.close();
    is_open_ = false;
    
    LOGI("Closed AV file: %s (%d video frames, %d audio samples)", 
         filename_.c_str(), video_frame_count_, audio_sample_count_);
}

// Completes the file before it is closed. Currently this only patches the
// size/count fields that were written as placeholders (see updateHeaders()).
void AVWriter::finalize() {
    updateHeaders();
}

void AVWriter::updateHeaders() {
    std::streampos current_pos = file_.tellp();
    
    // Update file size
    file_.seekp(file_size_pos_);
    uint32_t file_size = static_cast<uint32_t>(current_pos) - 8;
    file_.write(reinterpret_cast<const char*>(&file_size), 4);
    
    // Update total frames
    file_.seekp(video_frames_pos_);
    uint32_t total_frames = video_frame_count_;
    file_.write(reinterpret_cast<const char*>(&total_frames), 4);
    
    // Update video stream length (critical for correct duration calculation)
    file_.seekp(video_length_pos_);
    uint32_t video_length = video_frame_count_;
    file_.write(reinterpret_cast<const char*>(&video_length), 4);
    
    // Update audio samples if audio is enabled
    if (audio_enabled_) {
        file_.seekp(audio_samples_pos_);
        uint32_t audio_length = audio_sample_count_;
        file_.write(reinterpret_cast<const char*>(&audio_length), 4);
    }
    
    // Update movi size
    file_.seekp(movi_size_pos_);
    uint32_t movi_size = total_video_size_ + total_audio_size_ + 4; // +4 for "movi"
    file_.write(reinterpret_cast<const char*>(&movi_size), 4);
    
    // Restore position
    file_.seekp(current_pos);
}