To follow up on Devon Ryan's answer, I thought it would be a little fun to write a Python script that demonstrates using a bit array to maintain a presence/absence table.
Note: I wrote a C++ port that includes a custom bitset implementation that can be sized at runtime. This and the Python script are available on Github: https://github.com/alexpreynolds/kmer-boolean
#!/usr/bin/env python
import sys
import os
import bitarray
# read FASTA
def read_sequences():
global seqs
seqs = []
seq = ""
for line in sys.stdin:
if line.startswith('>'):
if len(seq) > 0:
seqs.append(seq)
seq = ""
else:
seq += line.strip()
seqs.append(seq)
# build and initialize bit array
def initialize_bitarray():
global ba
ba = bitarray.bitarray(4**k)
ba.setall(False)
sys.stderr.write("Memory usage of bitarray.bitarray instance is [%ld] bytes\n" % (ba.buffer_info()[1]))
# process sequences
def process_sequences():
global observed_kmers
observed_kmers = {}
for seq in seqs:
for i in range(0, len(seq)):
kmer = seq[i:i+k]
if len(kmer) == k:
observed_kmers[kmer] = None
idx = 0
for j in range(k-1, -1, -1):
idx += 4**(k-j-1) * bm[kmer[j]]
ba[idx] = True
def test_bitarray():
test_idx = 0
for j in range(k-1, -1, -1):
test_idx += 4**(k-j-1) * bm[test_kmer[j]]
test_result = ba[test_idx]
if test_result:
sys.stdout.write("%s found\n" % (test_kmer))
sys.exit(os.EX_OK)
else:
sys.stdout.write("%s not found\n" % (test_kmer))
sys.exit(os.EX_DATAERR)
def main():
global k
k = int(sys.argv[1])
global bm
bm = { 'A' : 0, 'C' : 1, 'T' : 2, 'G' : 3 }
read_sequences()
initialize_bitarray()
process_sequences()
try:
global test_kmer
test_kmer = sys.argv[2]
if len(test_kmer) == k:
test_bitarray()
else:
raise ValueError("test kmer (%s) should be of length k (%d)" % (test_kmer, k))
except IndexError as err:
keys = list(observed_kmers.keys())
for i in range(0, len(keys)):
sys.stdout.write("%s found\n" % (keys[i]))
sys.exit(os.EX_OK)
if __name__== "__main__":
main()
Note that this doesn't look at canonical kmers, e.g., AG is considered a distinct 2mer from its reverse complement CT.
To use this script, you pipe in your FASTA, specify the k, and an optional kmer that you want to test for presence/absence, e.g.:
$ echo -e ">foo\nCATTCTC\nGGGAC\n>bar\nTTATAT\n>baz\nTTTATTAG\nACCTCT" | ./kmer-bool.py 2 CG
Memory usage of bitarray.bitarray instance is [2] bytes
CG found
Or:
$ echo -e ">foo\nCATTCTC\nGGGAC\n>bar\nTTATAT\n>baz\nTTTATTAG\nACCTCT" | ./kmer-bool.py 3 AAA
Memory usage of bitarray.bitarray instance is [8] bytes
AAA not found
Or if the optional test kmer is left out:
$ echo -e ">foo\nCATTCTC\nGGGAC\n>bar\nTTATAT\n>baz\nTTTATTAG\nACCTCT" | ./kmer-bool.py 5
Memory usage of bitarray.bitarray instance is [128] bytes
CATTC found
ATTCT found
TTCTC found
TCTCG found
CTCGG found
TCGGG found
CGGGA found
GGGAC found
TTATA found
TATAT found
TTTAT found
TTATT found
TATTA found
ATTAG found
TTAGA found
TAGAC found
AGACC found
GACCT found
ACCTC found
CCTCT found
Or for the ~67M kmers in a 13-mer set, for which a roughly 8.4MB bit array is reserved:
$ echo -e ">foo\nCATTCTC\nGGGAC\n>bar\nTTATAT\n>baz\nTTTATTAG\nACCTCT" | ./kmer-bool.py 13
Memory usage of bitarray.bitarray instance is [8388608] bytes
TTTATTAGACCTC found
TTATTAGACCTCT found