So I've got a bunch of files like the picture shows. I need to remove the plus sign in between each set of 8 characters, but the problem is that these are .fastq files, which are huge and have many copies of that string of characters. Is there a way to replace every instance (regardless of the letters) in each file? Some of these are 85mb + and I can't do just a find and replace in Word, since it crashes. Any help would be appreciated.
Here is the code that I'm using, I'm not very well-versed in programming/Linux. Code is in Python:
import os
import argparse
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('dir',
help='The full path of the directory containing the original FASTQ Files')
parser.add_argument('keyFile',
help='The full path of the Key File containing a table with corresponding file names, sample names, and barcode sequences')
parser.add_argument('--out', help='the path to the directory where the modified FASTQ files will be stored; if none is provided, files are written to original directories')
args = parser.parse_args()
return args.dir, args.out, args.keyFile
def store_key_info(keyF):
try:
assert os.path.exists(keyF)
except AssertionError:
raise Exception('The Key File [%s] doesn\'t exist. Please check the location of the file.' % keyF)
f = open(keyF, 'rU')
f.readline()
kDict = {}
for line in f:
fName, sName, barcode = line.rstrip().split('\t')
kDict[barcode] = fName
return kDict
def find_all_files_in_dir_tree_with_ext(fqDir, extensionsList):
filesList = []
for root, dirnames, filenames in os.walk(fqDir):
for filename in filenames:
if filename.endswith(tuple(extensionsList)):
filesList.append(os.path.join(root, filename))
return filesList
def remove_ext_from_file_name(fName):
return os.path.splitext(fName)[0]
def identify_barcode_in_file_name(fName):
nameComponents = fName.split('_')
bc = nameComponents[-4].replace('-','')
return bc
def determine_new_fastq_name(fName, sampleID):
nameComps = fName.split('_')[-4:]
nameComps[0] = nameComps[0].replace('-', '')
nameComps.insert(0, sampleID)
nameComps.append('mod.fastq')
newFileName = '_'.join(nameComps)
return newFileName
def create_new_file(fName):
try:
f = open(fName, 'w')
except:
raise Exception('Something went wrong while trying to write new FASTQ files.')
return f
def replace_barcode_sequence(line, sID):
line = line.rstrip().split(':')
line[-1] = sID
line = ':'.join(line)
line += '\n'
return line
def modify_header(fName, sID, newFName):
# open old file
# open new file
newF = create_new_file(newFName)
oldF = open(fName, 'r')
for idx, row in enumerate(oldF):
if idx%4 == 0:
newHeader = replace_barcode_sequence(row, sID)
newF.write(newHeader)
else:
newF.write(row)
oldF.close()
newF.close()
return True
if __name__ == "__main__":
print("\n######################## Start ########################\n")
fqExt = ['fq', 'fastq']
oldDir, newDir, keyFile = get_arguments()
# check if a new directory was provided
writeToNewDir = False
if newDir is not None:
writeToNewDir = True
print("Modified FASTQ files will be written to a new directory [%s]." % newDir)
# check if the new directory exists, and if not, create it
if not os.path.exists(newDir):
os.makedirs(newDir)
else:
print("The modified FASTQ files will be written to the same directories as the original FASTQ files.")
# import sample-barcode key information
keyDict = store_key_info(keyFile)
print('I have imported the sample-barcode table; it contains %s barcodes.' % len(keyDict))
# look for FASTQ files in the user-specified directory
print('Looking for FASTQ files in [%s] ...' % oldDir)
fileList = find_all_files_in_dir_tree_with_ext(oldDir, fqExt)
print('A total of %s FASTQ files were found and are being modified.' % len(fileList))
usedBarcodes = []
shelvedBarcodes = []
for filePath in fileList:
fileParentDir, fileName = os.path.split(filePath)
oldFileName = remove_ext_from_file_name(fileName)
barcode = identify_barcode_in_file_name(oldFileName)
if barcode in keyDict:
usedBarcodes.append(barcode)
sampleID = keyDict[barcode]
newFileName = determine_new_fastq_name(oldFileName, sampleID)
# print(newFileName)
if writeToNewDir:
newFilePath = os.path.join(newDir, newFileName)
else:
newFilePath = os.path.join(fileParentDir, newFileName)
return_status = modify_header(filePath, sampleID, newFilePath)
if return_status:
print(' %s -- written' % (newFilePath))
else:
shelvedBarcodes.append(barcode)
print('Files have been modified.')
print('\n##################################\n')
usedBarcodes = list(set(usedBarcodes))
shelvedBarcodes = list(set(shelvedBarcodes))
print('Summary:')
print('Of the %s barcodes, %s were encountered in the files.' % (len(keyDict),len(usedBarcodes)))
print('Additionally, %s barcodes were encountered in the FASTQ files but were not found in the sample-barcode table.' % len(shelvedBarcodes))
print("\n######################## End ########################\n")
The highlighted area needs to be replaced:
EDIT: Was told to move an answer to the original post:Here are the first 100 lines of one such document. I need to change the 2:N:0: AAAAAAAA+GGGGGGGG section in all instances, in all files with the above code. I don't think the code will work with the + in the middle of them. I can't do a find and replace since some of these have nearly 100,000 reads which takes too long for my text editor to do (upwards of 8 hours), plus that 2:N:0 line isn't the same per read, so it's not 100%. The + sign between lines signals a new read, which should stay.
@M04574:62:000000000-BMHJ4:1:1101:20437:3142 2:N:0:GCTCATGA+ACTGCAAA
TNGTTTAAAGCCAGTACGTCAACATCCTAAGCGCAGATAAGGAGAAACAACCGACTGAAAGGGAACGGAGATAATCGAACGGAAGCTACGGGCACAAGAACAACATAAAGGGCAGAGAGGGGAACCGGAAAAGCTAACCGTAAAAAGAGACAAGAACGGGAAGCAGAAGCACCGGCGAGTAAAAAACGTAAGGAGGAGAAAGAGGCGACCACGACGGAACCAGCGGAAGAAAAACAAGATGGGTGCAAAAAGCAAACAGGGGGGAGAACAGTAAGAAGGCAAGAGGAGGCAGGGAGCGCGC
+
-#8---9;<@,,6,;@CC;EFBFGFGGGFG<FB@@@+,,6;,,;F,C,<@AFCC@+8C9<F8:D,C,,87@,,66CB,,,+,666B,:,,,4+48B,,,,:9,,9,:5A,,++++A=4+4B++8++++++3A8,,,3,3>8++8DF7+,,,6=,,,,3@+@B*@,54,,,@53*******,,511=3*2*//5=*/18=5C**)/*)07)/*)))))8)))))())01()((.7().(2*)).,-.))6.:((4(,(.,4((((073<04(.)))..,((.310((,41(-31(-1(((-(
@M04574:62:000000000-BMHJ4:1:1101:23391:4717 2:N:0:GCGCATGA+ACTGCAAA
TNGTTTAAAGCCATTATGCCAAGAGCGTAAGCATGTTACGTGGGCGAAACGCGACCAAAAAGACATAGTGAGTACATCGAGGGCAATGGCCGCATACGATAAAAGGATAAAACACACACGAAGAGGCCACAGACCTCAAAGCCTGAACCGACGGTCAAAATCGATGGTGAACCGGCAAAACAGGAAATGAAACAAGCAGAAGCAAGGCGGAAGACCAGAAGAGGCGAATGACTGCAAGCGGTGCACAGTCAACAAGGTAACGTACTGGTGGACGGTAGGTGCAAAGATAGTTTCATCGTGA
+
8#88-66@@A,,666;,;=,CC,C,,+78;C6666C,C,,,6,+,6+6++,8C7C@C,C<CF,:6:,,<C<E,,,,,6BE,++,:?:,4B,@+:8B9BF+CF,C54,,:,9A<A,AC++@FFEE,=43=+D,,,83AFFE,8>,,6,6>BECEC6*>:=;,,436@*37><7:;*1:CE,4ACCCCF,<,=+,4:C4C00;98ACC7)11/7))0)770<9<)4)57)(0--*1:77)(0(((()9*(-)..)58,()-))-(.)--,61:3(((,((-.).4).4()--)).)6))4(((
@M04574:62:000000000-BMHJ4:1:1101:18653:4992 2:N:0:GCTCATGA+ACTGCATA
TNGTTTAAAGCCATTACGTCATGGAGGATAACATGATTCATTGAAATTTCCCTCCTCGTTGAATCGGTTCCTACCCGCTTTGCCAGCCACACCGGCAGAGCCACCGGGCCATGGGAACAACTTTCAGCAACCAGCAGCCGCAACAGGGGCAATGCCTGGTCCATACGTGCCTGCTGCCACGAGCAATGCATCCGCCTGGCGTCCCCACGGAATGTCTCATCAGCCGTTGGCCATAGAATACCACTTCCGAGCACCCCTGGTCCTCAAGAACCGCTGTGAGCCTAAAACGGGAAAGGACAAA
+
-#86-9C<<E9C@;@,CC;EBEE<C<,6;C<FFG<@E,6CD,C<DE,6C@FEEC@F,BC,@@<6@F@8@@CCFG86@@CF7@@,BF?FFF@,6C:=FFDEG,BFGDC@FGGBF,DE49AEF8,9EE,CEDF<C=FEGG:C74BFGEDCE7+@,@8D,383@@EC=C:@D:87@@,@C:DFC>*@5D8,@<==C*1*3;D*8*)01;*:CCDGFF6C+13:A=63A>ADFFF@F)7ACE)@<>FF7=?7BF>F(3((--3:(:))9.).48(441((4(6<7((),.8848(4(,82(380:
@M04574:62:000000000-BMHJ4:1:1101:8566:5003 2:N:0:GGTCATGA+ACTGCTTA
TNGTTTAAAGCCATTACGTCAACATCCTCTGCCTCTTTTATACATCTTACTTTGCTTACGTACTGCTGCGGTCTTCTCTCTTTTTTACCCTTTTCTTTTAGTTTACAAACCCAGTTCACAACTTTCTCACCTTTCTGTCCCGTTTTCCCTCTCATCAAGCTTCTATATACTACTTACTCGTAGGCCGAACTCTTCCTCCACTGATTTCTTTATTATCTCCAATACTGACTTCGATCCTTCCCCTTTTGACACTTTCCCGTACTGTTTTGCTCCCATTTCACGGTTCTTTACCTCTTTCCCC
Instead of image, could you please post text / link to few example records? IMO, awk can address this much easier way (replacing the highlighted characters, in the example image, for every record in fastq file).
If you just need to remove the part after
2:xxxx
doCan you comment on why you are doing this? That plus sign indicates the two indexes that were originally used to tag that sample.
GCTCATGA (index 1) + ACTGCAAA (index2)
.