Skip to content

Commit

Permalink
fixing some todos
Browse files Browse the repository at this point in the history
  • Loading branch information
Will Rowe committed Jul 15, 2019
1 parent 38d67fa commit 54a77d4
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 47 deletions.
13 changes: 12 additions & 1 deletion cmd/sketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ func runSketch() {
log.Printf("\tconcept drift: enabled\n")
log.Printf("\tdecay ratio: %.2f\n", *decayRatio)
}
spectrumSize := helpers.Pow(*kmerSize, 4)
spectrumSize := int32(helpers.Pow(*kmerSize, 4))
log.Printf("\tnumber of bins in k-mer spectrum: %d\n", spectrumSize)
// adding any additional sketches?
log.Printf("\tadding KHF sketch: %v\n", *addKHF)
Expand Down Expand Up @@ -143,6 +143,17 @@ func runSketch() {
KMV: *addKMV,
}

// add the name(s) of the file(s) being sketched by HULK
if len(*fastq) == 0 {
hulkInfo.Sketch.FileName = "STDIN"
} else {
inputFiles := ""
for _, file := range *fastq {
inputFiles += file + ","
}
hulkInfo.Sketch.FileName = inputFiles
}

// create the pipeline
log.Printf("initialising sketching pipeline...\n")
sketchPipeline := pipeline.NewPipeline()
Expand Down
2 changes: 1 addition & 1 deletion src/bitvector/bitvector.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func maxSize() int {
case "js":
maxSize = 64
default:
maxSize = 32 // TODO: this should be an error
maxSize = 32
}
return maxSize
}
Expand Down
28 changes: 15 additions & 13 deletions src/histosketch/histosketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// I've made some changes in my implementation compared to the paper:
// - Instead of providing the number of histogram bins (Dimensions) and the number of countmin hash tables (d), I have decided to use epsilon and delta values to calculate CMS Dimensions.
// - As I am using HistoSketch to Sketch CMS counters, the Dimensions of the histosketch are determined by the CMS Dimensions

package histosketch

import (
Expand All @@ -29,8 +28,8 @@ type CWS struct {

// getSample yields A_ka from the CWS for an incoming histogram bin.
//
// i is the histogram bin, j is the current sketch position and freq is the
// frequency value for that bin. The samples are indexed by sketch position
// first and histogram bin second.
//
// It computes Y_ka = exp(ln(freq) - b[j][i]) and returns
// c[j][i] / (Y_ka * exp(r[j][i])).
//
// freq is passed straight to math.Log, so a zero or negative value gives
// -Inf or NaN, which propagates into the returned value.
func (CWS *CWS) getSample(i uint, j int, freq float64) float64 {
	Yka := math.Exp(math.Log(freq) - CWS.b[j][i])
	return CWS.c[j][i] / (Yka * math.Exp(CWS.r[j][i]))
}

// HistoSketch is the histosketch data structure
Expand All @@ -41,14 +40,14 @@ type HistoSketch struct {
Sketch []uint `json:"mins"` // S in paper
SketchWeights []float64 `json:"weights"` // A in paper
SketchSize uint `json:"num"` // number of minimums in the histosketch
Dimensions uint `json:"num_histogram_bins"` // number of histogram bins
Dimensions int32 `json:"num_histogram_bins"` // number of histogram bins
ApplyConceptDrift bool `json:"concept_drift"` // if true, uniform scaling will be applied to frequency estimates (in the CMS) and a decay ratio will be applied to sketch elements prior to assessing incoming elements
cwsSamples *CWS // the consistent weighted samples
cmSketch *countmin.CountMinSketch // Q in the paper (d * g matrix, where g is Sketch length)
}

// NewHistoSketch is the constructor function
func NewHistoSketch(kmerSize, sketchLength, histogramSize uint, decayRatio float64) (*HistoSketch, error) {
func NewHistoSketch(kmerSize, histosketchLength uint, numHistogramBins int32, decayRatio float64) (*HistoSketch, error) {

// run some basic checks
if kmerSize > MAX_K {
Expand All @@ -63,15 +62,18 @@ func NewHistoSketch(kmerSize, sketchLength, histogramSize uint, decayRatio float
if !checkFloat(decayRatio) {
return nil, fmt.Errorf("decay ratio must be between 0.0 and 1.0")
}
if numHistogramBins < 2 {
return nil, fmt.Errorf("histogram must have at least 2 bins")
}

// create the histosketch data structure
newHistosketch := &HistoSketch{
algorithm: "histosketch",
KmerSize: kmerSize,
Sketch: make([]uint, sketchLength),
SketchWeights: make([]float64, sketchLength),
SketchSize: sketchLength,
Dimensions: histogramSize,
Sketch: make([]uint, histosketchLength),
SketchWeights: make([]float64, histosketchLength),
SketchSize: histosketchLength,
Dimensions: numHistogramBins,
cmSketch: countmin.NewCountMinSketch(countmin.EPSILON, countmin.DELTA, decayRatio),
}
if decayRatio != 1.0 {
Expand Down Expand Up @@ -106,7 +108,7 @@ func (HistoSketch *HistoSketch) newCWS() {
r[i] = make([]float64, HistoSketch.Dimensions)
c[i] = make([]float64, HistoSketch.Dimensions)
b[i] = make([]float64, HistoSketch.Dimensions)
for j := uint(0); j < HistoSketch.Dimensions; j++ {
for j := int32(0); j < HistoSketch.Dimensions; j++ {
r[i][j] = gammaGenerator.Gamma(2, 1)
c[i][j] = math.Log(gammaGenerator.Gamma(2, 1))
//b[i][j] = uniformGenerator.Float64Range(0, 1) // as in paper
Expand All @@ -133,7 +135,7 @@ func (HistoSketch *HistoSketch) AddElement(bin uint64, value float64) error {
for histosketchSlot := range HistoSketch.Sketch {

// get the CWS value (A_ka) for the incoming element
A_ka := HistoSketch.cwsSamples.getSample(uint(bin), histosketchSlot, estiFreq)
Aka := HistoSketch.cwsSamples.getSample(uint(bin), histosketchSlot, estiFreq)

// get the current minimum in the histosketchSlot, accounting for concept drift if requested
var curMin float64
Expand All @@ -144,9 +146,9 @@ func (HistoSketch *HistoSketch) AddElement(bin uint64, value float64) error {
}

// if A_ka is a new minimum, replace both the bin and the weight held at this slot in the histosketch
if A_ka < curMin {
if Aka < curMin {
HistoSketch.Sketch[histosketchSlot] = uint(bin)
HistoSketch.SketchWeights[histosketchSlot] = A_ka
HistoSketch.SketchWeights[histosketchSlot] = Aka
}
}
return nil
Expand Down
49 changes: 25 additions & 24 deletions src/pipeline/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,31 @@ package pipeline
// BUFFERSIZE is the size of the buffer used by the pipeline channels.
// NOTE(review): the channel construction is not visible here — presumably this is the capacity passed to make(chan ...); confirm at the call sites.
const BUFFERSIZE int = 64

// Info stores the runtime information.
// It pairs a version string with the runtime settings of the sketch command.
type Info struct {
// Version is a version string (NOTE(review): presumably the HULK version; where it is set is not visible here)
Version string
// Sketch points to the runtime info for the sketch command
Sketch *SketchCmd
}

// SketchCmd stores the runtime info for the sketch command.
// Fields without a comment below are not described because their use is not visible in this listing.
type SketchCmd struct {
FileName string // the name(s) of the input file(s) being sketched, or STDIN if -f was not provided; runSketch appends a comma after each file name
// NOTE(review): presumably true when the input is FASTA rather than FASTQ — confirm against the flag definition
Fasta bool
// the k-mer size
KmerSize uint
WindowSize uint
// NOTE(review): presumably the number of bins in the k-mer spectrum, as logged by runSketch — confirm where it is assigned
SpectrumSize int32
SketchSize uint
ChunkSize uint
// the decay ratio; runSketch logs it when concept drift is enabled
DecayRatio float64
Stream bool
Interval uint
// output path without extension; Sketcher.Run appends ".json" when writing the sketch to disk
OutFile string
NumMinions int
// label copied into the HULKdata Banner field before the sketch is written
BannerLabel string
// whether a KHF sketch is added
KHF bool
// whether a KMV sketch is added
KMV bool
}

// process is the interface used by pipeline
type process interface {
Run()
Expand Down Expand Up @@ -48,27 +73,3 @@ func (Pipeline *Pipeline) Run() {
func (Pipeline *Pipeline) GetNumProcesses() int {
	// the answer is the length of the pipeline's process list
	held := Pipeline.processes
	return len(held)
}

// Info stores the runtime information
// NOTE(review): a type named Info is also declared earlier in this listing, directly after BUFFERSIZE; this copy looks like the pre-move side of the diff — confirm only one declaration survives.
type Info struct {
Version string
Sketch *SketchCmd
}

// SketchCmd stores the runtime info for the sketch command
// NOTE(review): a type named SketchCmd is also declared earlier in this listing; compared with that one, this copy has no FileName field and types SpectrumSize as uint rather than int32. It looks like the pre-change side of the diff — confirm only one declaration survives.
type SketchCmd struct {
Fasta bool
KmerSize uint
WindowSize uint
SpectrumSize uint
SketchSize uint
ChunkSize uint
DecayRatio float64
Stream bool
Interval uint
OutFile string
NumMinions int
BannerLabel string
KHF bool
KMV bool
}
1 change: 1 addition & 0 deletions src/pipeline/sketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ func (proc *Sketcher) Run() {
}

// add any final info to the HULKdata before writing the sketch to disk
hulkData.FileName = proc.info.Sketch.FileName
hulkData.Banner = proc.info.Sketch.BannerLabel
hulkData.WriteJSON(proc.info.Sketch.OutFile + ".json")
log.Printf("\twritten sketch to disk: %v\n", proc.info.Sketch.OutFile+".json")
Expand Down
10 changes: 2 additions & 8 deletions src/sketchio/sketchio.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,21 @@ func (HULKdata *HULKdata) Add(inputSketch SketchObject) error {
}

// WriteJSON writes a HULKdata to a JSON file on disk
func (HULKdata *HULKdata) WriteJSON(fileName string) error {
func (HULKdata *HULKdata) WriteJSON(outfileName string) error {

// Make sure it isn't an empty sketch object
if len(HULKdata.Signatures) == 0 {
return fmt.Errorf("no sketches have been added to the JSON object yet")
}

// TODO: check the filename?
HULKdata.FileName = fileName

// Marshal it
jsonData, err := json.MarshalIndent(HULKdata, "", " ")
if err != nil {
return fmt.Errorf("error marshalling to JSON: %v", err)
}

// Write it
err = ioutil.WriteFile(HULKdata.FileName, jsonData, 0644)
err = ioutil.WriteFile(outfileName, jsonData, 0644)
if err != nil {
return fmt.Errorf("error writing JSON to file: %v", err)
}
Expand Down Expand Up @@ -171,9 +168,6 @@ func LoadHULKdata(fileName string) (*HULKdata, error) {
if len(loadedData.Signatures) < 1 {
return nil, fmt.Errorf("no signatures found in supplied file: %v\n", fileName)
}
if loadedData.FileName != fileName {
return nil, fmt.Errorf("filename mismatch: %v vs. %v\n", loadedData.FileName, fileName)
}
if loadedData.Class != "hulk_sketch" {
return nil, fmt.Errorf("JSON not created by HULK: %v\n", fileName)
}
Expand Down

0 comments on commit 54a77d4

Please sign in to comment.