Can't correctly decode an image frame using PyAV


I'm trying to encode and then decode a captured frame from the web-cam. Eventually I want to send this over TCP, but at the moment I'm having trouble getting it to work even locally.

My code takes a frame from the web-cam, encodes it, then decodes it, and displays the source and decoded images in separate windows. The two images look like this:

(screenshot comparing the source and decoded frames omitted)

Here's the code:

import struct
import cv2
import socket
import av
import time
import os

class PerfTimer:
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start_time = time.perf_counter()

    def __exit__(self, type, value, traceback):
        end_time = time.perf_counter()
        print(f"'{self.name}' taken:", end_time - self.start_time, "seconds.")

os.environ['AV_PYTHON_AVISYNTH'] = 'C:/ffmpeg/bin'

socket_enabled = False
sock = None
if socket_enabled:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    print("Connecting to server...")
    sock.connect(('127.0.0.1', 8000))

# Set up video capture.
print("Opening web cam...")
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 800)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)

# Initialize the encoder.
encoder = av.CodecContext.create('h264', 'w')
encoder.width = 800
encoder.height = 600
encoder.pix_fmt = 'yuv420p'
encoder.bit_rate = 5000

# Initialize the decoder.
decoder = av.CodecContext.create('h264', 'r')
decoder.width = 800
decoder.height = 600
decoder.pix_fmt = 'yuv420p'
decoder.bit_rate = 5000

print("Streaming...")
while(cap.isOpened()):
    
    # Capture the frame from the camera.
    ret, orig_frame = cap.read()

    cv2.imshow('Source Video', orig_frame)

    # Convert to YUV.
    img_yuv = cv2.cvtColor(orig_frame, cv2.COLOR_BGR2YUV_I420)

    # Create a video frame object from the NumPy array.
    video_frame = av.VideoFrame.from_ndarray(img_yuv, format='yuv420p')

    with PerfTimer("Encoding") as p:
        encoded_frames = encoder.encode(video_frame)

    # Sometimes the encode results in no frames, so let's skip the frame.
    if len(encoded_frames) == 0:
        continue

    print(f"Decoding {len(encoded_frames)} frames...")

    for frame in encoded_frames:
        encoded_frame_bytes = bytes(frame)

        if socket_enabled:
            # Get the size of the encoded frame in bytes
            size = struct.pack('<L', len(encoded_frame_bytes))
            sock.sendall(size + encoded_frame_bytes)

        # Step 1: Create the packet from the frame.
        packet = av.packet.Packet(frame)

        # Step 2: Decode the packet.
        decoded_packets = decoder.decode(packet)

        for packet in decoded_packets:
            # Step 3: Convert the pixel format from the encoder color format to BGR for displaying.
            frame = cv2.cvtColor(packet.to_ndarray(format='yuv420p'), cv2.COLOR_YUV2BGR_I420)

            # Step 4. Display frame in window.
            cv2.imshow('Decoded Video', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# release everything
cap.release()
if socket_enabled:
    sock.close()
cv2.destroyAllWindows()

There are 2 answers below.

Answer from Christoph Rackwitz:

A bitrate is given in bits per second. 5000 bit/s is very little for 800×600 video.

Give it 5000000 bits/sec and see what happens.
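
In the code above that is a one-line change:

encoder.bit_rate = 5000000  # 5 Mbit/s instead of 5 kbit/s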

The naming in your code is misleading. Your "decoded packets" aren't packets but frames.
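
For example, the inner decoding loop from the question reads more clearly with accurate names (the same logic, only renamed):

decoded_frames = decoder.decode(packet)
for decoded_frame in decoded_frames:
    # decoder.decode() returns VideoFrame objects, not packets.
    bgr_image = cv2.cvtColor(decoded_frame.to_ndarray(format='yuv420p'), cv2.COLOR_YUV2BGR_I420)
    cv2.imshow('Decoded Video', bgr_image)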

Answer from Rotem:

As Christoph answered, the main issue is the low bit_rate.

The bit_rate setting is also meaningless if we don't set the framerate.
The average number of bits per frame is the bitrate divided by the framerate, so the encoder must know the framerate in order to budget the bits for each frame.
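For example, at 5,000,000 bits/s and 25 frames/s the encoder targets about 5,000,000 / 25 = 200,000 bits (roughly 25 KB) per frame.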

Without setting the framerate there is a warning message:

MB rate (1900000000) > level limit (16711680)
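
(The numbers in the warning add up: an 800×600 frame is 50×38 = 1900 16×16 macroblocks (600/16 = 37.5 rounds up to 38 rows), and 1,900,000,000 / 1900 = 1,000,000, which suggests the encoder fell back to a default framerate of 1,000,000 fps when none was set, blowing far past the level limit of 16,711,680 macroblocks per second.)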


We have to set the bitrate and the framerate (example):

encoder.bit_rate = 5000000
encoder.framerate = 25

To make a reproducible code sample, we may use synthetic video frames instead of grabbing frames from the camera (it also helps for testing).

I tried to use the correct terminology (the packets-versus-frames issue) that Christoph mentioned.

The code sample includes some suggestions in the comments:

import struct
import cv2
import socket
import av
import time
import os
import numpy as np


def make_sample_image(i, width, height):
    """ Build synthetic "raw BGR" image for testing """
    p = width//60
    img = np.full((height, width, 3), 60, np.uint8)
    cv2.putText(img, str(i+1), (width//2-p*10*len(str(i+1)), height//2+p*10), cv2.FONT_HERSHEY_DUPLEX, p, (255, 30, 30), p*2)  # Blue number
    return img


class PerfTimer:
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start_time = time.perf_counter()

    def __exit__(self, type, value, traceback):
        end_time = time.perf_counter()
        print(f"'{self.name}' taken:", end_time - self.start_time, "seconds.")

#os.environ['AV_PYTHON_AVISYNTH'] = 'C:/ffmpeg/bin'  # We are not using FFmpeg CLI

socket_enabled = False
sock = None
if socket_enabled:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    print("Connecting to server...")
    sock.connect(('127.0.0.1', 8000))

# Set up video capture.
#print("Opening web cam...")
#cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)  # We are going to use make_sample_image instead of cap.read()
#cap.set(cv2.CAP_PROP_FRAME_WIDTH, 800)
#cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 600)

# Initialize the encoder.
encoder = av.CodecContext.create('h264', 'w')
encoder.width = 800
encoder.height = 600
encoder.pix_fmt = 'yuv420p'
encoder.bit_rate = 5000000
encoder.framerate = 25  # Set the framerate - the bitrate is not meaningful without a framerate (or without frame period).
#encoder.options = {'tune': 'zerolatency'}  # Consider adding "-tune zerolatency" for reducing the latency.
#encoder.open()

# Initialize the decoder.
decoder = av.CodecContext.create('h264', 'r')
#decoder.width = 800  # There is no need to set width, height, pix_fmt, bit_rate for the decoder.
#decoder.height = 600
#decoder.pix_fmt = 'yuv420p'
#decoder.bit_rate = 5000
#decoder.open()

print("Streaming...")
#while (cap.isOpened()):    
for i in range(1000000):
    # Capture the frame from the camera.
    #ret, orig_frame = cap.read()

    # Use synthetic frame for testing.
    orig_frame = make_sample_image(i, encoder.width, encoder.height)

    cv2.imshow('Source Video', orig_frame)

    # Convert to YUV.
    img_yuv = cv2.cvtColor(orig_frame, cv2.COLOR_BGR2YUV_I420)

    # Create a video frame object from the NumPy array.
    video_frame = av.VideoFrame.from_ndarray(img_yuv, format='yuv420p')
    #video_frame = av.VideoFrame.from_ndarray(orig_frame, format='bgr24')  # BGR is also supported...

    #video_frame.pts = i  # We may want to set the timestamps
    #video_frame.time_base = encoder.time_base

    #with PerfTimer("Encoding") as p:
    #    encoded_frames = encoder.encode(video_frame)
    encoded_packet = encoder.encode(video_frame)  # The correct terminology is "encoded_packet".

    # Sometimes encoding produces no packets yet (the encoder may buffer frames), so let's skip this iteration.
    if len(encoded_packet) == 0:
        continue

    print(f"Decoding {len(encoded_packet)} packets...")

    #for frame in encoded_packet:  # No need to use a for loop - there is always going to be only one encoded packet.
    #encoded_frame_bytes = bytes(frame)
    encoded_packet_bytes = bytes(encoded_packet[0])

    if socket_enabled:
        # Get the size of the encoded frame in bytes
        size = struct.pack('<L', len(encoded_packet_bytes))
        sock.sendall(size + encoded_packet_bytes)

    # Step 1: Create the packet from the frame.
    #packet = av.packet.Packet(frame)

    # Step 1: Create the packet from the "bytes".
    packet = av.packet.Packet(encoded_packet_bytes)

    # Step 2: Decode the packet.
    #decoded_packets = decoder.decode(packet)
    decoded_video_frames = decoder.decode(packet)  # After decoding, the terminology is "decoded_frames"

    if len(decoded_video_frames) > 0:
        # Step 3: Convert the pixel format from the encoder color format to BGR for displaying.
        decoded_video_frame = decoded_video_frames[0]
        decoded_frame = decoded_video_frame.to_ndarray(format='yuv420p')
        frame = cv2.cvtColor(decoded_frame, cv2.COLOR_YUV2BGR_I420)
        #frame = decoded_video_frame.to_ndarray(format='bgr24')  # BGR is also supported...

        # Step 4. Display frame in window.
        cv2.imshow('Decoded Video', frame)

    if cv2.waitKey(100) & 0xFF == ord('q'):
        break

# release everything
#cap.release()
if socket_enabled:
    sock.close()
cv2.destroyAllWindows()
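
Since the final goal is sending the stream over TCP, here is a minimal sketch of a matching receiver for the length-prefixed protocol used above. The address and the recv_exact helper are illustrative assumptions, not part of the original code:

import struct
import socket
import av
import cv2


def recv_exact(conn, n):
    """Read exactly n bytes (a TCP recv may return partial data)."""  # Hypothetical helper.
    buf = b''
    while len(buf) < n:
        chunk = conn.recv(n - len(buf))
        if not chunk:
            raise ConnectionError("Socket closed mid-packet")
        buf += chunk
    return buf


decoder = av.CodecContext.create('h264', 'r')

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(('127.0.0.1', 8000))  # Assumed address - must match the sender.
server.listen(1)
print("Waiting for sender...")
conn, _ = server.accept()

while True:
    # Read the 4-byte little-endian size prefix, then the H.264 payload.
    size = struct.unpack('<L', recv_exact(conn, 4))[0]
    payload = recv_exact(conn, size)

    # Decode exactly as in the local example above.
    packet = av.packet.Packet(payload)
    for decoded_frame in decoder.decode(packet):
        img = cv2.cvtColor(decoded_frame.to_ndarray(format='yuv420p'), cv2.COLOR_YUV2BGR_I420)
        cv2.imshow('Received Video', img)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

conn.close()
server.close()
cv2.destroyAllWindows()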