Skip to content

Commit e989c4e

Browse files
usecases and bug fix (#6)
* feat: record-only search — Added an API that returns only which files contain data in the query range, and reworked the query-data method to use this API. * showcase: to be removed. * feat: heat map output for results — Added a region_plot function (name might change later) that allows the user to output a heatmap of the query range. * feat: refactor plotting, plot by category — When provided with a dictionary mapping file_name (path to file) to its corresponding type, the user can generate a heatmap aggregated by type. * benchmark: option for number of files — Added a number-of-files option for benchmarking a large database. * breaking: specifying multiple files — Specifying particular files now takes a list as the argument, supporting queries over multiple specified files instead of just one. * benchmark: update due to function call change — Due to the previous breaking change, updated the benchmarking scripts accordingly. * feat: heatmap, hit, search bug fix — Refactored the heatmap generation functions, changed the output for hit search, and fixed a bug in index building.
1 parent d75ce6b commit e989c4e

36 files changed

+1238
-65
lines changed

.coveragerc

100644100755
File mode changed.

.gitignore

100644100755
File mode changed.

.travis.yml

100644100755
File mode changed.

AUTHORS.rst

100644100755
File mode changed.

CHANGELOG.rst

100644100755
File mode changed.

CONTRIBUTING.rst

100644100755
File mode changed.

LICENSE.txt

100644100755
File mode changed.

README.rst

100644100755
File mode changed.

Showcase.ipynb

Lines changed: 1060 additions & 0 deletions
Large diffs are not rendered by default.

benchmarks/benchmarking.py

100644100755
Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
parser.add_argument("--query_range", dest='query_range', help="benchmark query range.", default=5000)
2323
parser.add_argument("--query_times", dest='query_times', help="benchmark query times.", default=10)
24+
parser.add_argument("--num_file", dest='num_file', help="Number of files used in benchmark.", default=10)
2425
parser.add_argument("--files_names", dest='files_names', help="benchmark file names.", default='./large_test_data/index')
2526
parser.add_argument("--files_path", dest='files_path', help="benchmark file folder path.", default='./large_test_data/')
2627
parser.add_argument("--index_path", dest='index_path', help="benchmark index path.", default='./index_data/')
@@ -36,6 +37,7 @@
3637
files_path = args.files_path
3738
index_path = args.index_path
3839
remove_index = bool(args.remove_index)
40+
num_file = int(args.num_file)
3941

4042
if os.path.exists(index_path):
4143
print("Warning: Index path is not empty. Will load index from index_path")
@@ -48,6 +50,10 @@
4850
for line in f:
4951
files.append(line.strip())
5052

53+
if num_file >0:
54+
files = random.sample(files, num_file)
55+
56+
print('number of files: ', len(files))
5157

5258
# generate queries
5359
genomes = get_genome('mm10')
@@ -68,39 +74,44 @@
6874
setup_t = time.time()-t
6975

7076
t = time.time()
71-
dfs = []
77+
7278
for chromosome, start in queries:
79+
dfs = []
7380
for bw,f in zip(bws,files):
7481
res, err = bw.getRange(chr=chromosome, start=start, end=start + query_range, zoomlvl = -2)
7582
res["file"] = f
7683
dfs.append(res)
77-
dfs = pd.concat(dfs, axis = 0)
84+
dfs = pd.concat(dfs, axis = 0)
7885
read_t = time.time()-t
7986

87+
# print(dfs)
8088
print("FileParser setup time:", setup_t)
8189
print("FileParser read time:", read_t)
8290

8391

92+
93+
8494
# # Quindex
8595

86-
t = time.time()
96+
8797
genome = get_genome('mm10')
8898
base_path=index_path
8999

100+
t = time.time()
90101
if os.path.exists(index_path):
91102
index = EpivizQuindex.EpivizQuindex(genome, base_path=base_path)
92103
index.from_disk()
93104
else:
94105
index = EpivizQuindex.EpivizQuindex(genome, base_path=base_path)
95106
for f in files:
96-
print(f)
107+
# print(f)
97108
index.add_to_index(files_path + f)
98109
index.to_disk()
99110
setup_t = time.time()-t
100111

101112
t = time.time()
102113
for chromosome, start in queries:
103-
index.query(chromosome, start, start + query_range)
114+
index.has_data(chromosome, start, start + query_range)
104115
read_t = time.time()-t
105116

106117
print("Quindex setup time:", setup_t)
@@ -111,7 +122,7 @@
111122

112123
t = time.time()
113124
for chromosome, start in queries:
114-
index.query(chromosome, start, start + query_range, in_memory = False)
125+
dfs = index.has_data(chromosome, start, start + query_range, in_memory = False, file_names = files)
115126
read_t = time.time()-t
116127

117128
print("Quindex file-based search time:", read_t)

0 commit comments

Comments
 (0)