Skip to content

Commit

Permalink
Added a test for the NMDC database and fixed up study ID retrieval.
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-cohere committed Nov 19, 2024
1 parent ceb42cc commit eaf6968
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 24 deletions.
3 changes: 2 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ func validateEndpoints(endpoints map[string]endpointConfig) error {
Endpoint: name,
Message: "Invalid UUID",
}
} else if endpoint.Provider == "" { // no provider given
}
if endpoint.Provider == "" { // no provider given
return InvalidEndpointConfigError{
Endpoint: name,
Message: "No provider specified",
Expand Down
57 changes: 34 additions & 23 deletions databases/nmdc/database.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ func (db Database) Resources(fileIds []string) ([]frictionless.DataResource, err
// add credit metadata
studyId := studyIdForDataObjectId[resources[i].Id]
resources[i].Credit = creditForStudyId[studyId]
resources[i].Credit.ResourceType = "dataset"
resources[i].Credit.Identifier = resources[i].Id
}
return resources, nil
}
Expand Down Expand Up @@ -486,11 +488,12 @@ func (db Database) studyIdsForDataObjectIds(dataObjectIds []string) (map[string]
// NOTE:
// NOTE: If we need to, we can break up our aggregate queries into smaller
// NOTE: chunks, since these queries are independent.
type MatchIdInSlice struct {
In []string `json:"$in,omitempty"`
}
type MatchOperation struct {
// matches a single record by ID
Id string `json:"id,omitempty"`
// matches a record whose ID is in the given list
In []string `json:"in,omitempty"`
// matches an ID with one of those in the given list
Id MatchIdInSlice `json:"id"`
}
type LookupOperation struct {
From string `json:"from"`
Expand Down Expand Up @@ -518,25 +521,20 @@ func (db Database) studyIdsForDataObjectIds(dataObjectIds []string) (map[string]
// match against our set of data object IDs
{
Match: &MatchOperation{
In: dataObjectIds,
Id: MatchIdInSlice{
In: dataObjectIds,
},
},
},
// look up the data object's workflow execution set
// (the study IDs for the data generation set are in
// the associated_studies field)
{
Lookup: &LookupOperation{
From: "data_generation_set",
LocalField: "was_generated_by",
ForeignField: "id",
As: "data_generation_id",
},
},
// look up the study for the data generation set
{
Lookup: &LookupOperation{
From: "study_set",
LocalField: "associated_studies",
ForeignField: "id",
As: "study_id",
As: "data_generation_sets",
},
},
},
Expand All @@ -550,16 +548,20 @@ func (db Database) studyIdsForDataObjectIds(dataObjectIds []string) (map[string]
if err != nil {
return nil, err
}
type DataObjectStudyPair struct {
DataObjectId string `json:"id"`
StudyId string `json:"study_id"`
type DataGenerationSet struct {
Id string `json:"id"`
AssociatedStudies []string `json:"associated_studies"`
}
type DataObjectAndDataGenerationSet struct {
DataObjectId string `json:"id"`
DataGenerationSets []DataGenerationSet `json:"data_generation_sets"`
}
type QueryResults struct {
Ok int `json:"ok"`
Cursor struct {
FirstBatch []DataObjectStudyPair `json:"firstBatch"`
Id int `json:"id"`
NS string `json:"ns"`
FirstBatch []DataObjectAndDataGenerationSet `json:"firstBatch"`
Id int `json:"id"`
NS string `json:"ns"`
}
}
var results QueryResults
Expand All @@ -570,8 +572,17 @@ func (db Database) studyIdsForDataObjectIds(dataObjectIds []string) (map[string]

// map each data object ID to the corresponding study ID
studyIdForDataObjectId := make(map[string]string)
for _, pair := range results.Cursor.FirstBatch {
studyIdForDataObjectId[pair.DataObjectId] = pair.StudyId
for _, record := range results.Cursor.FirstBatch {
// FIXME: for now, take the first study in the first data generation set
if len(record.DataGenerationSets) > 0 {
if len(record.DataGenerationSets[0].AssociatedStudies) > 0 {
studyIdForDataObjectId[record.DataObjectId] = record.DataGenerationSets[0].AssociatedStudies[0]
} else {
slog.Debug(fmt.Sprintf("No study is associated with the data object %s", record.DataObjectId))
}
} else {
slog.Debug(fmt.Sprintf("No data generation info was found for the data object %s", record.DataObjectId))
}
}
return studyIdForDataObjectId, err
}
Expand Down
137 changes: 137 additions & 0 deletions databases/nmdc/database_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package nmdc

import (
"encoding/json"
"os"
"testing"

"github.com/stretchr/testify/assert"

"github.com/kbase/dts/config"
"github.com/kbase/dts/databases"
"github.com/kbase/dts/dtstest"
"github.com/kbase/dts/endpoints"
"github.com/kbase/dts/endpoints/globus"
)

// nmdcConfig is the configuration text passed to config.Init by setup. It
// declares the "nmdc" database with two endpoints (nersc and emsl) and three
// Globus endpoint definitions.
// NOTE(review): the ${DTS_GLOBUS_...} placeholders are presumably filled in
// from environment variables when the configuration is parsed -- confirm
// against config.Init.
const nmdcConfig string = `
databases:
nmdc:
name: National Microbiome Data Collaborative
organization: DOE
endpoints:
nersc: globus-nmdc-nersc
emsl: globus-nmdc-emsl
endpoints:
globus-nmdc-nersc:
name: NMDC (NERSC)
id: ${DTS_GLOBUS_TEST_ENDPOINT}
provider: globus
root: /
auth:
client_id: ${DTS_GLOBUS_CLIENT_ID}
client_secret: ${DTS_GLOBUS_CLIENT_SECRET}
globus-nmdc-emsl:
name: NMDC Bulk Data Cache
id: ${DTS_GLOBUS_TEST_ENDPOINT}
provider: globus
root: /
auth:
client_id: ${DTS_GLOBUS_CLIENT_ID}
client_secret: ${DTS_GLOBUS_CLIENT_SECRET}
globus-jdp:
name: Globus NERSC DTN
id: ${DTS_GLOBUS_TEST_ENDPOINT}
provider: globus
auth:
client_id: ${DTS_GLOBUS_CLIENT_ID}
client_secret: ${DTS_GLOBUS_CLIENT_SECRET}
`

// since NMDC doesn't support search queries at this time, we search for
// data objects related to a study
var nmdcSearchParams map[string]json.RawMessage

// setup runs once at the beginning of a test session (TestMain calls it before
// any test executes). It turns on debug logging, loads the test configuration,
// registers the NMDC database and the Globus endpoint provider, and populates
// nmdcSearchParams with the study ID used by the search tests.
func setup() {
	dtstest.EnableDebugLogging()
	config.Init([]byte(nmdcConfig))
	databases.RegisterDatabase("nmdc", NewDatabase)
	endpoints.RegisterEndpointProvider("globus", globus.NewEndpoint)

	// NMDC-specific search parameters: restrict the search to a single study;
	// the error from marshalling the study ID string is discarded
	encodedStudyID, _ := json.Marshal("nmdc:sty-11-5tgfr349")
	nmdcSearchParams = map[string]json.RawMessage{
		"study_id": encodedStudyID,
	}
}

// breakdown runs once after all tests have been run (TestMain calls it after
// m.Run returns). It is currently empty and exists as the counterpart to
// setup, i.e. the place to put any session-wide cleanup.
func breakdown() {
}

func TestNewDatabase(t *testing.T) {
assert := assert.New(t)
orcid := os.Getenv("DTS_KBASE_TEST_ORCID")
db, err := NewDatabase(orcid)
assert.NotNil(db, "NMDC database not created")
assert.Nil(err, "NMDC database creation encountered an error")
}

func TestNewDatabaseWithoutOrcid(t *testing.T) {
assert := assert.New(t)
db, err := NewDatabase("")
assert.Nil(db, "Invalid NMDC database somehow created")
assert.NotNil(err, "NMDC database creation without ORCID encountered no error")
}

func TestSearch(t *testing.T) {
assert := assert.New(t)
orcid := os.Getenv("DTS_KBASE_TEST_ORCID")
db, _ := NewDatabase(orcid)

params := databases.SearchParameters{
Query: "",
Specific: nmdcSearchParams,
}
results, err := db.Search(params)
assert.True(len(results.Resources) > 0, "NMDC search query returned no results")
assert.Nil(err, "NMDC search query encountered an error")
}

func TestResources(t *testing.T) {
assert := assert.New(t)
orcid := os.Getenv("DTS_KBASE_TEST_ORCID")
db, _ := NewDatabase(orcid)
params := databases.SearchParameters{
Query: "",
Specific: nmdcSearchParams,
}
results, _ := db.Search(params)
fileIds := make([]string, len(results.Resources))
for i, res := range results.Resources {
fileIds[i] = res.Id
}
resources, err := db.Resources(fileIds[:10])
assert.Nil(err, "NMDC resource query encountered an error")
assert.Equal(10, len(resources),
"NMDC resource query didn't return requested number of results")
for i, resource := range resources {
jdpSearchResult := results.Resources[i]
assert.Equal(jdpSearchResult.Id, resource.Id, "Resource ID mismatch")
assert.Equal(jdpSearchResult.Name, resource.Name, "Resource name mismatch")
assert.Equal(jdpSearchResult.Path, resource.Path, "Resource path mismatch")
assert.Equal(jdpSearchResult.Format, resource.Format, "Resource format mismatch")
assert.Equal(jdpSearchResult.Bytes, resource.Bytes, "Resource size mismatch")
assert.Equal(jdpSearchResult.MediaType, resource.MediaType, "Resource media type mismatch")
assert.Equal(jdpSearchResult.Credit.Identifier, resource.Credit.Identifier, "Resource credit ID mismatch")
assert.Equal(jdpSearchResult.Credit.ResourceType, resource.Credit.ResourceType, "Resource credit resource type mismatch")
}
}

// TestMain is the entry point for this package's tests: it performs setup,
// runs every test, performs breakdown, and exits with the status reported by
// the test run.
func TestMain(m *testing.M) {
	setup()
	exitCode := m.Run()
	// breakdown is called explicitly (not deferred) because os.Exit ends the
	// process without running deferred functions
	breakdown()
	os.Exit(exitCode)
}

0 comments on commit eaf6968

Please sign in to comment.