Question

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

0

Entering edit mode

13 months ago

wangjincheng • 0

I am using a Python package (ancIBD) to load and analyze vcf.gz files:

for ch in chs:
    vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
                     path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
                     path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
                     marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
                     map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
                     af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
                     col_sample_af = "",
                     buffer_size=20000, chunk_width=8, chunk_length=20000,
                     ch=ch)

but I get the error below:

UnicodeDecodeError                        Traceback (most recent call last)
Cell In[14], line 6
      3 chs = range(1,23)
      5 for ch in chs:
----> 6     vcf_to_1240K_hdf(in_vcf_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/GLIMPSE_ligated/merged_chr{ch}.vcf.gz",
      7                      path_vcf = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.vcf",
      8                      path_h5 = f"/mnt/gpfs/Users/wangjincheng/aDNA/All_data/Analysis4/19.genotype_imputation/GLIMPSE_test/ancIBD/chr{ch}.h5",
      9                      marker_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/filters/snps_bcftools_ch{ch}.csv",
     10                      map_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k.snp",
     11                      af_path = f"/mnt/gpfs/Users/wangjincheng/aDNA/reference/data/afs/v51.1_1240k_AF_ch{ch}.tsv",
     12                      col_sample_af = "",
     13                      buffer_size=20000, chunk_width=8, chunk_length=20000,
     14                      ch=ch)

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/ancIBD/IO/prepare_h5.py:116, in vcf_to_1240K_hdf(in_vcf_path, path_vcf, path_h5, marker_path, map_path, af_path, col_sample_af, chunk_length, chunk_width, buffer_size, ch)
    113     os.remove(path_h5)
    115 print("Converting to HDF5...")
--> 116 allel.vcf_to_hdf5(input=path_vcf, output=path_h5, 
    117                   fields = ['variants/*', 'calldata/*', "samples"], 
    118                   types = {"samples":"S60", "calldata/GT":np.int8,
    119                            "calldata/GP":np.float32, "calldata/PL":np.float32}, 
    120                   buffer_size=buffer_size,
    121                   chunk_length = chunk_length, chunk_width=chunk_width,
    122                   compression="gzip") # Do the conversion to hdf5. Takes hours
    123 print("Finished conversion to hdf5!")
    125 print("Merging in LD Map..")

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:693, in vcf_to_hdf5(input, output, group, compression, compression_opts, shuffle, overwrite, vlen, fields, exclude_fields, rename_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length, chunk_width, log)
    690 store_samples, fields = _prep_fields_param(fields)
    692 # setup chunk iterator
--> 693 fields, samples, headers, it = iter_vcf_chunks(
    694     input, fields=fields, exclude_fields=exclude_fields, types=types,
    695     numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,
    696     chunk_length=chunk_length, fills=fills, region=region, tabix=tabix,
    697     samples=samples, transformers=transformers
    698 )
    700 # handle field renaming
    701 if rename_fields:

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1138, in iter_vcf_chunks(input, fields, exclude_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length)
   1134 stream = _setup_input_stream(input=input, region=region, tabix=tabix,
   1135                              buffer_size=buffer_size)
   1137 # setup iterator
-> 1138 fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)
   1140 # setup transformers
   1141 if transformers is not None:
   1142     # API flexibility

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1636, in _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number, chunk_length, fills, region, samples)
   1632 def _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number,
   1633                      chunk_length, fills, region, samples):
   1634 
   1635     # read VCF headers
-> 1636     headers = _read_vcf_headers(stream)
   1638     # setup samples
   1639     samples, loc_samples = _normalize_samples(samples=samples, headers=headers,
   1640                                               types=types)

File /mnt/gpfs/Users/wangjincheng/software/miniconda3_new/lib/python3.9/site-packages/allel/io/vcf_read.py:1711, in _read_vcf_headers(stream)
   1709 # read first header line
   1710 header = stream.readline()
-> 1711 header = str(header, 'utf8')
   1713 while header and header[0] == '#':
   1715     headers.append(header)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

How can I solve this problem? Thanks a lot!

vcf.gz UnicodeDecodeError ancIBD • 1.4k views

ADD COMMENT • link 13 months ago by wangjincheng • 0

0

Entering edit mode

use python package to load and analysis vcf.gz files

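It looks like it doesn't like gzipped files. The byte 0x8b at position 1 is the second byte of the gzip magic number (0x1f 0x8b), so the file being read is still gzip-compressed while the reader is trying to decode it as plain UTF-8 text. Are you sure this software can read gzipped files?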