A nice feature of bgen-reader is that it will create an index for the bgen file. This means that while the first 'open' might take 10 minutes, every 'open' after that is instantaneous. Also, reads from just one SNP then take less than 1/10 of a second, even on giant files.
Here is what we came up with:
"""Read the genotype probabilities for a single SNP from a large .bgen file.

bgen-reader indexes the file the first time it is opened (roughly as slow as
a file copy); every subsequent open is instantaneous and a single-variant
read completes in well under a second, even on giant files.
"""
import numpy as np
import pandas as pd
from bgen_reader import example_filepath, open_bgen

# bgen-reader docs: https://bgen-reader.readthedocs.io/en/latest/numpyapi.html
# Use this sample small file or download the large sample file
# example_filepath("haplotypes.bgen") # little sample file
# snp = "RS3"
# Download from https://www.dropbox.com/sh/vuzozn39vsw8zcl/AAAZT2aRMB3V8kdz6CzlmdY-a?dl=0
file = r"c:\deldir\merged_487400x4840000.bgen" # large file
snp = "sid_5_120000"

# The first time the file is opened it is indexed and a .metadata2.mmm file is
# created. That takes about as long as a file copy (~10 minutes for the big
# file on a fast SSD); subsequent openings are instantaneous.
# Use the context manager so the file is closed deterministically rather than
# relying on `del` and garbage collection.
with open_bgen(file, verbose=False) as bgen:
    variant_index = bgen.rsids == snp  # boolean mask selecting the one SNP
    p = bgen.read(variant_index)  # probabilities array, shape (n_samples, 1, 3)
    print(p.shape)  # => (487400, 1, 3)
    print(bgen.allele_ids[variant_index])  # for example ['A,C']
    # probabilities for AA,AC,CC
    df1 = pd.DataFrame({'sample': bgen.samples, '0': p[:, 0, 0], '1': p[:, 0, 1], '2': p[:, 0, 2]})
print(df1.head())
https://github.com/CarlKCarlK