From d3f112b5bb9c8b7ba68ae3b95c25d015a95871df Mon Sep 17 00:00:00 2001
From: "Jeffrey N. Johnson" <jeff@cohere-llc.com>
Date: Wed, 25 Sep 2024 13:58:58 -0700
Subject: [PATCH] Filled in some more gaps in the NMDC implementation.

---
 credit/metadata.go         |  52 ++-
 databases/nmdc/database.go | 702 +++++++++++++++++--------------------
 2 files changed, 346 insertions(+), 408 deletions(-)

diff --git a/credit/metadata.go b/credit/metadata.go
index 95be64f9..06883ffb 100644
--- a/credit/metadata.go
+++ b/credit/metadata.go
@@ -1,7 +1,7 @@
 package credit
 
 /*
- * Represents a contributor to the resource.
+  - Represents a contributor to the resource.
 
 Contributors must have a 'contributor_type', either 'Person' or 'Organization', and
 one of the 'name' fields: either 'given_name' and 'family_name' (for a person), or 'name' (for an organization or a person).
@@ -13,8 +13,7 @@ appropriate roles, please see the following links:
 DataCite contributor roles: https://support.datacite.org/docs/datacite-metadata-schema-v44-recommended-and-optional-properties#7a-contributortype
 
 CRediT contributor role taxonomy: https://credit.niso.org
-
- */
+*/
 type Contributor struct {
 	/*
 	 * Must be either 'Person' or 'Organization'
@@ -47,7 +46,7 @@ type Contributor struct {
 }
 
 /*
- * Represents the credit metadata associated with an object.
+  - Represents the credit metadata associated with an object.
 
 In the following documentation, 'Resource' is used to refer to the object
 that the CM pertains to, for example, a KBase Workspace object or a
@@ -76,8 +75,7 @@ the last update (if applicable).
 
 The resource_type field is required, but as there is currently only a single valid
 value, 'dataset', it is automatically populated if no value is supplied.
-
- */
+*/
 type CreditMetadata struct {
 	/*
 	 * List of strings of freeform text providing extra information about this credit metadata.
@@ -112,11 +110,11 @@ type CreditMetadata struct {
 	 */
 	Identifier string `json:"identifier"`
 	/*
-	 * Usage license for the resource. Use one of the SPDX license identifiers or provide a link to the license text if no SPDX ID is available.
+			 * Usage license for the resource. Use one of the SPDX license identifiers or provide a link to the license text if no SPDX ID is available.
 
-All data published at KBase is done so under a Creative Commons 0 or Creative Commons 4.0 license.
+		All data published at KBase is done so under a Creative Commons 0 or Creative Commons 4.0 license.
 
-	 */
+	*/
 	License License `json:"license"`
 	/*
 	 * Metadata for this credit information, including submitter, schema version, and timestamp.
@@ -167,11 +165,10 @@ type Description struct {
 }
 
 /*
- * Represents an event in the lifecycle of a resource and the date it occurred on.
+  - Represents an event in the lifecycle of a resource and the date it occurred on.
 
 See https://support.datacite.org/docs/datacite-metadata-schema-v44-recommended-and-optional-properties#8-date for more information on the events.
-
- */
+*/
 type EventDate struct {
 	/*
 	 * The date associated with the event. The date may be in the format YYYY, YYYY-MM, or YYYY-MM-DD.
@@ -184,7 +181,7 @@ type EventDate struct {
 }
 
 /*
- * Represents a funding source for a resource, including the funding body and the grant awarded.
+  - Represents a funding source for a resource, including the funding body and the grant awarded.
 
 The 'funder_name' field is required; all others are optional.
 
@@ -194,8 +191,7 @@ Recommended resources for organization identifiers include:
   - Crossref Funder Registry, https://www.crossref.org/services/funder-registry/ (to be subsumed into ROR)
 
 Some organizations may have a digital object identifier (DOI).
-
- */
+*/
 type FundingReference struct {
 	/*
 	 * Code for the grant, assigned by the funder
@@ -248,7 +244,7 @@ type Metadata struct {
 }
 
 /*
- * Represents an organization.
+  - Represents an organization.
 
 Recommended resources for organization identifiers and canonical organization names include:
   - Research Organization Registry, http://ror.org
@@ -256,10 +252,10 @@ Recommended resources for organization identifiers and canonical organization na
   - Crossref Funder Registry, https://www.crossref.org/services/funder-registry/
 
 For example, the US DOE would be entered as:
-  organization_name: United States Department of Energy
-  organization_id:   ROR:01bj3aw27
 
- */
+	organization_name: United States Department of Energy
+	organization_id:   ROR:01bj3aw27
+*/
 type Organization struct {
 	/*
 	 * Persistent unique identifier for the organization in the format <database name>:<identifier within database>
@@ -272,7 +268,7 @@ type Organization struct {
 }
 
 /*
- * Represents a persistent unique identifier for an entity, with an optional relationship to some other entity.
+  - Represents a persistent unique identifier for an entity, with an optional relationship to some other entity.
 
 The 'id' field and 'relationship_type' fields are required.
 
@@ -281,8 +277,7 @@ The values in the 'relationship_type' field come from controlled vocabularies ma
 DataCite relation types: https://support.datacite.org/docs/datacite-metadata-schema-v44-recommended-and-optional-properties#12b-relationtype
 
 Crossref relation types: https://www.crossref.org/documentation/schema-library/markup-guide-metadata-segments/relationships/
-
- */
+*/
 type PermanentID struct {
 	/*
 	 * Persistent unique ID for an entity. Should be in the format <database name>:<identifier within database>.
@@ -293,19 +288,18 @@ type PermanentID struct {
 	 */
 	Description string `json:"description"`
 	/*
-	 * The relationship between the ID and some other entity.
-For example, when a PermanentID class is used to represent objects in the CreditMetadata field 'related_identifiers', the 'relationship_type' field captures the relationship between the CreditMetadata and this ID.
+			 * The relationship between the ID and some other entity.
+		For example, when a PermanentID class is used to represent objects in the CreditMetadata field 'related_identifiers', the 'relationship_type' field captures the relationship between the CreditMetadata and this ID.
 
-	 */
+	*/
 	RelationshipType string `json:"relationship_type"`
 }
 
 /*
- * Represents the title or name of a resource, the type of that title, and the language used (if appropriate).
+  - Represents the title or name of a resource, the type of that title, and the language used (if appropriate).
 
 The 'title' field is required; 'title_type' is only necessary if the text is not the primary title.
-
- */
+*/
 type Title struct {
 	/*
 	 * The language in which the title is written, using the appropriate IETF BCP-47 notation.
@@ -320,5 +314,3 @@ type Title struct {
 	 */
 	TitleType string `json:"title_type"`
 }
-
-
diff --git a/databases/nmdc/database.go b/databases/nmdc/database.go
index 88c8d90f..c39f17e8 100644
--- a/databases/nmdc/database.go
+++ b/databases/nmdc/database.go
@@ -46,8 +46,8 @@ import (
 )
 
 const (
-	nmdcBaseURL     = "https://api.microbiomedata.org"
-	filePathPrefix = "/data/" // path exposing NMDC files available via Globus
+	baseApiURL  = "https://api.microbiomedata.org/"
+	baseDataURL = "https://data.microbiomedata.org/data/"
 )
 
 // this error type is returned when a file is requested for which the requester
@@ -69,153 +69,88 @@ func (e FileIdNotFoundError) Error() string {
 	return fmt.Sprintf("Can't access file %s: not found.", e.fileId)
 }
 
-// a mapping from file suffixes to format labels
-var suffixToFormat = map[string]string{
-	"bam":      "bam",
-	"bam.bai":  "bai",
-	"blasttab": "blast",
-	"bz":       "bzip",
-	"bz2":      "bzip2",
-	"csv":      "csv",
-	"faa":      "fasta",
-	"fasta":    "fasta",
-	"fasta.gz": "fasta",
-	"fastq":    "fastq",
-	"fastq.gz": "fastq",
-	"fna":      "fasta",
-	"gff":      "gff",
-	"gff3":     "gff3",
-	"gz":       "gz",
-	"html":     "html",
-	"info":     "texinfo",
-	"out":      "text",
-	"pdf":      "pdf",
-	"tar":      "tar",
-	"tar.gz":   "tar",
-	"tar.bz":   "tar",
-	"tar.bz2":  "tar",
-	"tsv":      "tsv",
-	"txt":      "text",
+// a mapping from NMDC file types to format labels
+// (see https://microbiomedata.github.io/nmdc-schema/FileTypeEnum/)
+var fileTypeToFormat = map[string]string{
+	"Annotation Amino Acid FASTA":  "fasta",
+	"Annotation Enzyme Commission": "tsv",
+	"Annotation KEGG Orthology":    "tsv",
+	"Assembly AGP":                 "agp",
+	"Assembly Contigs":             "fasta",
+	"Assembly Coverage BAM":        "bam",
+	"Assembly Info File":           "texinfo",
+	"Assembly Scaffolds":           "fasta",
+	"BAI File":                     "bai",
+	"CATH FunFams (Functional Families) Annotation GFF": "gff3",
+	"Centrifuge Krona Plot":                             "html",
+	"Clusters of Orthologous Groups (COG) Annotation GFF", "gff3",
+	"CRT Annotation GFF":                 "gff3",
+	"Direct Infusion FT ICR-MS Raw Data": "raw",
+	"Error Corrected Reads":              "fastq",
+	"Filtered Sequencing Reads":          "fastq",
+	"Functional Annotation GFF":          "gff3",
+	"Genemark Annotation GFF":            "gff3",
+	"Gene Phylogeny tsv":                 "tsv",
+	"GOTTCHA2 Krona Plot":                "html",
+	"KO_EC Annotation GFF":               "gff3",
+	"Kraken2 Krona Plot":                 "html",
+	"LC-DDA-MS/MS Raw Data":              "raw",
+	"Metagenome Bins":                    "fasta",
+	"Metagenome Raw Reads":               "raw",
+	"Metagenome Raw Read 1":              "raw",
+	"Metagenome Raw Read 2":              "raw",
+	"Misc Annotation GFF":                "gff3",
+	"Pfam Annotation GFF":                "gff3",
+	"Prodigal Annotation GFF":            "gff3",
+	"QC non-rRNA R1":                     "fastq",
+	"QC non-rRNA R2":                     "fastq",
+	"Read Count and RPKM":                "json",
+	"RFAM Annotation GFF":                "gff3",
+	"Scaffold Lineage tsv":               "tsv",
+	"Structural Annotation GFF":          "gff3",
+	"Structural Annotation Stats Json":   "json",
+	"SUPERFam Annotation GFF":            "gff3",
+	"SMART Annotation GFF":               "gff3",
+	"TIGRFam Annotation GFF":             "gff3",
+	"TMRNA Annotation GFF":               "gff3",
+	"TRNA Annotation GFF":                "gff3",
 }
 
-// this gets populated automatically with the keys in suffixToFormat
-var supportedSuffixes []string
-
 // a mapping from file format labels to mime types
 var formatToMimeType = map[string]string{
-	"bam":   "application/octet-stream",
-	"bai":   "application/octet-stream",
-	"csv":   "text/csv",
-	"fasta": "text/plain",
-	"fastq": "text/plain",
-	"gff":   "text/plain",
-	"gff3":  "text/plain",
-	"gz":    "application/gzip",
-	"bz":    "application/x-bzip",
-	"bz2":   "application/x-bzip2",
-	"tar":   "application/x-tar",
-	"text":  "text/plain",
-}
-
-// a mapping from the JDP's reported file types to mime types
-// (backup method for determining mime types)
-var fileTypeToMimeType = map[string]string{
-	"text":     "text/plain",
-	"fasta":    "text/plain",
-	"fasta.gz": "application/gzip",
-	"fastq":    "text/plain",
-	"fastq.gz": "application/gzip",
-	"tab":      "text/plain",
-	"tar.gz":   "application/x-tar",
-	"tar.bz":   "application/x-tar",
-	"tar.bz2":  "application/x-tar",
-}
-
-// attributes (slots) associated with NDMC data types
-// (see https://microbiomedata.github.io/nmdc-schema/)
-var nmdcDataAttributes = []string {
-  // this list is giant--hopefully there's a way to programmatically
-  // query NMDC for this
+	"agp":     "application/octet-stream",
+	"bam":     "application/octet-stream",
+	"bai":     "application/octet-stream",
+	"csv":     "text/csv",
+	"fasta":   "text/plain",
+	"fastq":   "text/plain",
+	"gff":     "text/plain",
+	"gff3":    "text/plain",
+	"gz":      "application/gzip",
+	"bz":      "application/x-bzip",
+	"bz2":     "application/x-bzip2",
+	"json":    "application/json",
+	"raw":     "application/octet-stream",
+	"tar":     "application/x-tar",
+	"text":    "text/plain",
+	"texinfo": "text/plain",
+	"tsv":     "text/plain",
 }
 
 // extracts the file format from the name and type of the file
-func formatFromFileName(fileName string) string {
-	// make a list of the supported suffixes if we haven't yet
-	if supportedSuffixes == nil {
-		supportedSuffixes = make([]string, 0)
-		for suffix := range suffixToFormat {
-			supportedSuffixes = append(supportedSuffixes, suffix)
-		}
-	}
-
-	// determine whether the file matches any of the supported suffixes,
-	// selecting the longest matching suffix
-	format := "unknown"
-	longestSuffix := 0
-	for _, suffix := range supportedSuffixes {
-		if strings.HasSuffix(fileName, suffix) && len(suffix) > longestSuffix {
-			format = suffixToFormat[suffix]
-			longestSuffix = len(suffix)
-		}
+func formatFromType(fileType string) string {
+	if format, found := fileTypeToFormat[fileType]; found {
+		return format
 	}
-	return format
+	return "unknown"
 }
 
 // extracts the file format from the name and type of the file
-func mimeTypeFromFormatAndTypes(format string, fileTypes []string) string {
-	// try to match the file type to a mime type
-	for _, fileType := range fileTypes {
-		if mimeType, ok := fileTypeToMimeType[fileType]; ok {
-			return mimeType
-		}
-	}
-	// check the file format to see whether it matches a mime type
+func mimeTypeFromFormat(format string) string {
 	if mimeType, ok := formatToMimeType[format]; ok {
 		return mimeType
 	}
-	return ""
-}
-
-// extracts file type information from the given File
-func fileTypesFromFile(file File) []string {
-	// TODO: See https://pkg.go.dev/encoding/json?utm_source=godoc#example-RawMessage-Unmarshal
-	// TODO: for an example of how to unmarshal a variant type.
-	return []string{}
-}
-
-// extracts source information from the given metadata
-func sourcesFromMetadata(md Metadata) []frictionless.DataSource {
-	sources := make([]frictionless.DataSource, 0)
-	piInfo := md.Proposal.PI
-	if len(piInfo.LastName) > 0 {
-		var title string
-		if len(piInfo.FirstName) > 0 {
-			title = fmt.Sprintf("%s, %s", piInfo.LastName, piInfo.FirstName)
-			if len(piInfo.MiddleName) > 0 {
-				title += fmt.Sprintf(" %s", piInfo.MiddleName)
-			}
-			if len(piInfo.Institution) > 0 {
-				if len(piInfo.Country) > 0 {
-					title += fmt.Sprintf(" (%s, %s)", piInfo.Institution, piInfo.Country)
-				} else {
-					title += fmt.Sprintf(" (%s)", piInfo.Institution)
-				}
-			} else if len(piInfo.Country) > 0 {
-				title += fmt.Sprintf(" (%s)", piInfo.Country)
-			}
-		}
-		var doiURL string
-		if len(md.Proposal.AwardDOI) > 0 {
-			doiURL = fmt.Sprintf("https://doi.org/%s", md.Proposal.AwardDOI)
-		}
-		source := frictionless.DataSource{
-			Title: title,
-			Path:  doiURL,
-			Email: piInfo.EmailAddress,
-		}
-		sources = append(sources, source)
-	}
-	return sources
+	return "application/octet-stream"
 }
 
 // creates a Frictionless DataResource-savvy name for a file:
@@ -281,9 +216,9 @@ func NewDatabase(orcid string) (databases.Database, error) {
 	}
 
 	return &Database{
-		Id:         "nmdc",
-		Orcid:      orcid,
-		Secret:     secret,
+		Id:     "nmdc",
+		Orcid:  orcid,
+		Secret: secret,
 	}, nil
 }
 
@@ -293,7 +228,7 @@ func (db Database) addAuthHeader(request *http.Request) {
 }
 
 // performs a GET request on the given resource, returning the resulting
-// response and error
+// response body and/or error
 func (db *Database) get(resource string, values url.Values) (*http.Response, error) {
 	var u *url.URL
 	u, err := url.ParseRequestURI(jdpBaseURL)
@@ -309,11 +244,17 @@ func (db *Database) get(resource string, values url.Values) (*http.Response, err
 		return nil, err
 	}
 	db.addAuthHeader(req)
-	return db.Client.Do(req)
+	resp, err := db.Client.Do(req)
+	if err != nil {
+		return results, err
+	}
+	defer resp.Body.Close()
+	var body []byte
+	return io.ReadAll(resp.Body)
 }
 
 // performs a POST request on the given resource, returning the resulting
-// response and error
+// response body and/or error
 func (db *Database) post(resource string, body io.Reader) (*http.Response, error) {
 	u, err := url.ParseRequestURI(jdpBaseURL)
 	if err != nil {
@@ -328,95 +269,46 @@ func (db *Database) post(resource string, body io.Reader) (*http.Response, error
 	}
 	db.addAuthHeader(req)
 	req.Header.Set("Content-Type", "application/json")
-	return db.Client.Do(req)
+	resp, err := db.Client.Do(req)
+	if err != nil {
+		return results, err
+	}
+	defer resp.Body.Close()
+	var body []byte
+	return io.ReadAll(resp.Body)
 }
 
 // data object type for JSON marshalling
 // (see https://microbiomedata.github.io/nmdc-schema/DataObject/)
 type DataObject struct {
-  FileSizeBytes int `json:"file_size_bytes"`
-  MD5Checksum string `json:"md5_checksum"`
-  DataObjectType string `json:"data_object_type"`
-  CompressionType string `json:"compression_type"`
-  // NOTE: no representation of was_generated_by (abstract type) at the moment
-  URL string `json:"url"`
-  Type string `json:"type"`
-  Id string `json:"id"`
-  Name string `json:"name"`
-  Description string `json:"description"`
-  AlternativeIdentifiers []string `json:"alternative_identifiers,omitempty"`
+	FileSizeBytes   int    `json:"file_size_bytes"`
+	MD5Checksum     string `json:"md5_checksum"`
+	DataObjectType  string `json:"data_object_type"`
+	CompressionType string `json:"compression_type"`
+	// NOTE: no representation of was_generated_by (abstract type) at the moment
+	URL                    string   `json:"url"`
+	Type                   string   `json:"type"`
+	Id                     string   `json:"id"`
+	Name                   string   `json:"name"`
+	Description            string   `json:"description"`
+	AlternativeIdentifiers []string `json:"alternative_identifiers,omitempty"`
 }
 
-func dataResourceFromDataObject(dataObject DataObject) frictionless.DataResource {
-  format := formatFromType(dataObject.Type)
+func (db *Database) dataResourceFromDataObject(dataObject DataObject) frictionless.DataResource {
 	return frictionless.DataResource{
-		Id:        id,
-		Name:      dataResourceName(dataObject.Name),
-    Description: dataObject.Description,
-		Path:      dataObject.Name,
-		Format:    format,
-		MediaType: mimeTypeFromFormatAndTypes(format, fileTypes),
-		Bytes:     dataObject.FileSizeBytes,
-		Hash:      dataObject.MD5Checksum,
-		Credit: credit.CreditMetadata{
-			Identifier:   id,
-			ResourceType: "dataset",
-			Titles: []credit.Title{
-				{
-					Title: filePath,
-				},
-			},
-			Dates: []credit.EventDate{
-				{
-					Date:  file.Date,
-					Event: "Created",
-				},
-				{
-					Date:  file.AddedDate,
-					Event: "Accepted",
-				},
-				{
-					Date:  file.ModifiedDate,
-					Event: "Updated",
-				},
-			},
-			Publisher: credit.Organization{
-				OrganizationId:   "ROR:04xm1d337",
-				OrganizationName: "Joint Genome Institute",
-			},
-			RelatedIdentifiers: []credit.PermanentID{
-				{
-					Id:               file.Metadata.Proposal.DOI,
-					Description:      "Proposal DOI",
-					RelationshipType: "IsCitedBy",
-				},
-				{
-					Id:               file.Metadata.Proposal.AwardDOI,
-					Description:      "Awarded proposal DOI",
-					RelationshipType: "IsCitedBy",
-				},
-			},
-			Contributors: []credit.Contributor{
-				{
-					ContributorType: "Person",
-					// ContributorId: nothing yet
-					Name:       strings.TrimSpace(fmt.Sprintf("%s, %s %s", pi.LastName, pi.FirstName, pi.MiddleName)),
-					GivenName:  strings.TrimSpace(fmt.Sprintf("%s %s", pi.FirstName, pi.MiddleName)),
-					FamilyName: strings.TrimSpace(pi.LastName),
-					Affiliations: []credit.Organization{
-						{
-							OrganizationName: pi.Institution,
-						},
-					},
-					ContributorRoles: "PI",
-				},
-			},
-			Version: file.Date,
-		},
+		Id:          id,
+		Name:        dataResourceName(dataObject.Name),
+		Description: dataObject.Description,
+		Path:        filepath.Join(db.Endpoint().Root(), strings.Replace(dataObject.URL, baseDataURL, "")),
+		Format:      formatFromType(dataObject.Type),
+		MediaType:   mimeTypeFromFormat(formatFromType(dataObject.Type)),
+		Bytes:       dataObject.FileSizeBytes,
+		Hash:        dataObject.MD5Checksum,
 	}
 }
 
-// fetches metadata for data objects based on the given parameters
+// fetches metadata for data objects (no credit metadata, alas) based on the
+// given URL search parameters
 func (db *Database) dataObjects(params url.Values) (databases.SearchResults, error) {
 	var results databases.SearchResults
 
@@ -429,50 +321,155 @@ func (db *Database) dataObjects(params url.Values) (databases.SearchResults, err
 		params.Del("extra")
 	}
 
-	resp, err := db.get("data_objects/", params)
-	if err != nil {
-		return results, err
-	}
-	defer resp.Body.Close()
-	var body []byte
-	body, err = io.ReadAll(resp.Body)
-	if err != nil {
-		return results, err
-	}
+	body, err := db.get("data_objects/", params)
 	type DataObjectResults struct {
-    // NOTE: we only extract the results field for now
-    Results []DataObject `json:"results"`
+		// NOTE: we only extract the results field for now
+		Results []DataObject `json:"results"`
 	}
 	var dataObjectResults DataObjectResults
 	err = json.Unmarshal(body, &dataObjectResults)
 	if err != nil {
 		return results, err
 	}
-  results.Resources = make([]frictionless.DataResource, len(dataObjectResults.Results))
-  for i, dataObject := range dataObjectResults.Results {
-    results.Resources[i] = dataResourcesFromDataObjects(dataObject)
-  }
+
+	// fetch credit metadata (FIXME: no way to do this currently)
+	var creditMetadata credit.Metadata
+
+	results.Resources = make([]frictionless.DataResource, len(dataObjectResults.Results))
+	for i, dataObject := range dataObjectResults.Results {
+		results.Resources[i] = dataResourceFromDataObject(dataObject)
+		results.Resources[i].Credit = creditMetadata
+		results.Resources[i].Credit.Identifier = resources[i].Id
+		// FIXME: we can probably chase down credit metadata dates using the
+		// FIXME: generated_by (Activity) field, instantiated as one of the
+		// FIXME: concrete types listed here: https://microbiomedata.github.io/nmdc-schema/WorkflowExecutionActivity/
+	}
+
 	return results, nil
 }
 
-// fetches metadata for data objects associated with the given study
-func (db *Database) dataObjectsForStudy(studyId string) (databases.SearchResults, error) {
-	var results databases.SearchResults
-
-	resp, err := db.get(fmt.Sprintf("data_objects/study/%s", studyId), url.Values{})
+// fetches credit metadata for the study with the given ID
+func (db *Database) creditMetadataForStudy(studyId string) (credit.Metadata, error) {
+	// vvv credit-related NMDC schema types vvv
+
+	// https://microbiomedata.github.io/nmdc-schema/CreditAssociation/
+	type CreditAssociation struct {
+		AppliedRoles    []string    `json:"applied_roles"`
+		AppliesToPerson PersonValue `json:"applies_to_person"`
+		Type            string      `json:"type,omitempty"`
+	}
+
+	// https://microbiomedata.github.io/nmdc-schema/Doi/
+	type Doi struct {
+		Value    string `json:"doi_value"`
+		Provider string `json:"doi_provider,omitempty"`
+		Category string `json:"doi_category"`
+	}
+
+	// https://microbiomedata.github.io/nmdc-schema/PersonValue/
+	type PersonValue struct {
+		Email    string   `json:"email,omitempty"`
+		Name     string   `json:"name,omitempty"`
+		Orcid    string   `json:"orcid,omitempty"`
+		Websites []string `json:"websites,omitempty"`
+		RawValue string   `json:"has_raw_value,omitempty"` // name in 'FIRST LAST' format (if present)
+	}
+
+	// https://microbiomedata.github.io/nmdc-schema/Study/
+	type Study struct { // partial representation, includes only relevant fields
+		Id                    string              `json:"id"`
+		AlternativeNames      []string            `json:"alternative_names,omitempty"`
+		AlternativeTitles     []string            `json:"alternative_titles,omitempty"`
+		AssociatedDois        []Doi               `json:"associated_dois,omitempty"`
+		Description           string              `json:"description,omitempty"`
+		FundingSources        []string            `json:"funding_sources,omitempty"`
+		HasCreditAssociations []CreditAssociation `json:"has_credit_associations,omitempty"`
+		Name                  string              `json:"name,omitempty"`
+		PrincipalInvestigator PersonValue         `json:"principal_investigator,omitempty"`
+		RelatedIdentifiers    string              `json:"related_identifiers,omitempty"`
+		Title                 string              `json:"title,omitempty"`
+	}
+
+	// fetch the study with the given ID
+	var creditMetadata credit.Metadata
+	body, err := db.get(fmt.Sprintf("studies/%s", studyId), url.Values{})
 	if err != nil {
-		return results, err
+		return creditMetadata, err
 	}
-	defer resp.Body.Close()
-	var body []byte
-	body, err = io.ReadAll(resp.Body)
+	var study Study
+	err = json.Unmarshal(body, &study)
 	if err != nil {
-		return results, err
+		return creditMetadata, err
+	}
+
+	// fish metadata out of the study
+	contributors := []credit.Contributor{
+		{
+			ContributorType:  "Person",
+			Name:             study.PrincipalInvestigator.Name,
+			ContributorRoles: "PI",
+		},
+	}
+	if study.PrincipalInvestigator.RawValue != "" {
+		names := strings.Split(study.PrincipalInvestigator.RawValue)
+		contributors[0].GivenName = names[0]
+		contributors[0].FamilyName = names[1]
+	}
+
+	var titles []credit.Title
+	if study.Title != "" {
+		titles = make([]credit.Title, len(study.AlternativeTitles)+1)
+		titles[0].Title = study.Title
+		for i, title := range titles {
+			titles[i+1].Title = title
+		}
+	}
+
+	var relatedIdentifiers []credit.PermanentID
+	if len(study.AssociatedDois) > 0 {
+		relatedIdentifiers = make([]credit.PermanentID, len(study.AssociatedDois))
+		for i, doi := range study.AssociatedDois {
+			relatedIdentifiers[i] = credit.PermanentID{
+				Id:               doi.Value,
+				RelationshipType: "IsCitedBy",
+			}
+			switch doi.Category {
+			case "award_doi":
+				relatedIdentifiers[i].Description = "Awarded proposal DOI"
+			case "dataset_doi":
+				relatedIdentifiers[i].Description = "Dataset DOI"
+			case "publication_doi":
+				relatedIdentifiers[i].Description = "Publication DOI"
+			case "data_management_plan_doi":
+				relatedIdentifiers[i].Description = "Data management plan DOI"
+			}
+		}
 	}
 
+	creditMetadata = credit.Metadata{
+		// Identifier, Dates, and Version fields are specific to DataResources, omitted here
+		ResourceType: "dataset",
+		Titles:       titles,
+		Publisher: credit.Organization{
+			OrganizationId:   "ROR:05cwx3318",
+			OrganizationName: "National Microbiome Data Collaborative",
+		},
+		RelatedIdentifiers: relatedIdentifiers,
+		Contributors:       contributors,
+	}
+
+	return creditMetadata, err
+}
+
+// fetches file metadata for data objects associated with the given study
+func (db *Database) dataObjectsForStudy(studyId string) (databases.SearchResults, error) {
+	var results databases.SearchResults
+
+	body, err := db.get(fmt.Sprintf("data_objects/study/%s", studyId), url.Values{})
+
 	type DataObjectsByStudyResults struct {
-    BiosampleId string `json:"biosample_id"`
-    DataObjectSet []string `json:"data_object_set"`
+		BiosampleId   string   `json:"biosample_id"`
+		DataObjectSet []string `json:"data_object_set"`
 	}
 	var results []DataObjectsByStudyResults
 	err = json.Unmarshal(body, &results)
@@ -480,15 +477,27 @@ func (db *Database) dataObjectsForStudy(studyId string) (databases.SearchResults
 		return results, err
 	}
 
-  // gather all the data object IDs into a single list (they should already have
-  // an "nmdc:" CURIE prefix) and fetch their metadata
-  dataObjectIds := make([]string, 0)
-  for _, result := range results {
-    for _, dataObjectId := range result.DataObjectSet {
-      dataObjectIds.append(dataObjectId)
-    }
-  }
-  return db.Resources(dataObjectIds), nil
+	// gather all the data object IDs into a single list (they should already have
+	// an "nmdc:" CURIE prefix) and fetch their metadata
+	dataObjectIds := make([]string, 0)
+	for _, result := range results {
+		for _, dataObjectId := range result.DataObjectSet {
+			dataObjectIds.append(dataObjectId)
+		}
+	}
+	resources := db.Resources(dataObjectIds)
+
+	// add the credit metadata to each resource
+	creditMetadata := creditMetadataFromStudy(studyId)
+	for i, _ := range resources {
+		resources[i].Credit = creditMetadata
+		resources[i].Credit.Identifier = resources[i].Id
+		// FIXME: we can probably chase down credit metadata dates using the
+		// FIXME: generated_by (Activity) field, instantiated as one of the
+		// FIXME: concrete types listed here: https://microbiomedata.github.io/nmdc-schema/WorkflowExecutionActivity/
+	}
+
+	return resources, nil
 }
 
 // returns the page number and page size corresponding to the given Pagination
@@ -513,17 +522,17 @@ func pageNumberAndSize(offset, maxNum int) (int, int) {
 }
 
 func (db Database) SpecificSearchParameters() map[string]interface{} {
-  // for details about NMDC-specific search parameters, see
-  // https://api.microbiomedata.org/docs#/find:~:text=Find%20NMDC-,metadata,-entities.
+	// for details about NMDC-specific search parameters, see
+	// https://api.microbiomedata.org/docs#/find:~:text=Find%20NMDC-,metadata,-entities.
 	return map[string]interface{}{
-    "activity_id": "",
-    "data_object_id": "",
-    "fields": "",
-    "filter": "",
-    "sort": "",
-		"sample_id": "",
-		"study_id": "",
-		"extra": "",
+		"activity_id":    "",
+		"data_object_id": "",
+		"fields":         "",
+		"filter":         "",
+		"sort":           "",
+		"sample_id":      "",
+		"study_id":       "",
+		"extra":          "",
 	}
 }
 
@@ -532,18 +541,17 @@ func (db Database) addSpecificSearchParameters(params map[string]json.RawMessage
 	paramSpec := db.SpecificSearchParameters()
 	for name, jsonValue := range params {
 		switch name {
-    case "activity_id", "data_object_id", "filter", "sort", "sample_id",
-         "study_id":
-  		var value string
-	  	err := json.Unmarshal(jsonValue, &value)
-  		if err != nil {
-	  		return &databases.InvalidSearchParameter{
-		 			Database: "nmdc",
-           Message:  fmt.Sprintf("Invalid value for parameter %s (must be string)", name)
-  			}
-       }
-		}
-    case "fields": // accepts comma-delimited strings
+		case "activity_id", "data_object_id", "filter", "sort", "sample_id",
+			"study_id":
+			var value string
+			err := json.Unmarshal(jsonValue, &value)
+			if err != nil {
+				return &databases.InvalidSearchParameter{
+					Database: "nmdc",
+					Message:  fmt.Sprintf("Invalid value for parameter %s (must be string)", name),
+				}
+			}
+		case "fields": // accepts comma-delimited strings
 			var value string
 			err := json.Unmarshal(jsonValue, &value)
 			if err != nil {
@@ -561,30 +569,30 @@ func (db Database) addSpecificSearchParameters(params map[string]json.RawMessage
 					Message:  fmt.Sprintf("Invalid requested extra field: %s", value),
 				}
 			}
-    case "extra": // accepts comma-delimited strings
+		case "extra": // accepts comma-delimited strings
 		default:
 			return &databases.InvalidSearchParameter{
 				Database: "nmdc",
 				Message:  fmt.Sprintf("Unrecognized NMDC-specific search parameter: %s", name),
 			}
-    }
-  p.Add(name, value)
+		}
+	}
 	return nil
 }
 
 func (db *Database) Search(params databases.SearchParameters) (databases.SearchResults, error) {
 	p := url.Values{}
 
-  // fetch pagination parameters
+	// fetch pagination parameters
 	pageNumber, pageSize := pageNumberAndSize(params.Pagination.Offset, params.Pagination.MaxNum)
 	p.Add("page", strconv.Itoa(pageNumber))
 	p.Add("per_page", strconv.Itoa(pageSize))
 
-  // NMDC's "search" parameter is not yet implemented, so we ignore it for now
-  // in favor of "filter"
+	// NMDC's "search" parameter is not yet implemented, so we ignore it for now
+	// in favor of "filter"
 	//p.Add("search", params.Query)
 
-  // add any NMDC-specific search parameters
+	// add any NMDC-specific search parameters
 	if params.Specific != nil {
 		err := db.addSpecificSearchParameters(params.Specific, &p)
 		if err != nil {
@@ -592,110 +600,48 @@ func (db *Database) Search(params databases.SearchParameters) (databases.SearchR
 		}
 	}
 
-  // dispatch the search to the proper endpoint, depending on whether we're
-  // looking for a study or individual data objects
-  if p.Has("study_id") {
-    return db.dataObjectsForStudy(p.Get("study_id"))
-  } else {
-    // simply call the data_objects/ endpoint with the given query string
-    p.Add("filter", params.Query) // FIXME: 
-    return db.dataObjects(p)
-  }
-}
-
-func (db *Database) Resources(dataObjectIds []string) ([]frictionless.DataResource, error) {
-	type MetadataRequest struct {
-		Ids                []string `json:"ids"`
-		Aggregations       bool     `json:"aggregations"`
-		IncludePrivateData bool     `json:"include_private_data"`
-	}
-	data, err := json.Marshal(MetadataRequest{
-		Ids:                strippedFileIds,
-		Aggregations:       false,
-		IncludePrivateData: true,
-	})
-	if err != nil {
-		return nil, err
-	}
-
-	resp, err := db.post("search/by_file_ids/", bytes.NewReader(data))
-	defer resp.Body.Close()
-	var body []byte
-	body, err = io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, err
-	}
-	type MetadataResponse struct {
-		Hits struct {
-			Hits []struct {
-				Type   string `json:"_type"`
-				Id     string `json:"_id"`
-				Source struct {
-					Date         string `json:"file_date"`
-					AddedDate    string `json:"added_date"`
-					ModifiedDate string `json:"modified_date"`
-					FilePath     string `json:"file_path"`
-					FileName     string `json:"file_name"`
-					FileSize     int    `json:"file_size"`
-					MD5Sum       string `json:"md5sum"`
-					Metadata     Metadata
-				} `json:"_source"`
-			} `json:"hits"`
-		} `json:"hits"`
-	}
-	var jdpResp MetadataResponse
-	err = json.Unmarshal(body, &jdpResp)
-	if err != nil {
-		return nil, err
+	// dispatch the search to the proper endpoint, depending on whether we're
+	// looking for a study or individual data objects
+	if p.Has("study_id") {
+		return db.dataObjectsForStudy(p.Get("study_id"))
+	} else {
+		// simply call the data_objects/ endpoint with the given query string
+		p.Add("search", params.Query) // FIXME: not yet supported by NMDC!
+		return db.dataObjects(p)
 	}
+}
 
-	// translate the response
-	resources := make([]frictionless.DataResource, len(strippedFileIds))
-	for i, md := range jdpResp.Hits.Hits {
-		if md.Id == "" { // permissions problem
-			return nil, &PermissionDeniedError{fileIds[i]}
-		}
-		index, found := indexForId[md.Id]
-		if !found {
-			return nil, &FileIdNotFoundError{fileIds[i]}
-		}
-		file := File{
-			Id:           md.Id,
-			Name:         md.Source.FileName,
-			Path:         md.Source.FilePath,
-			Date:         md.Source.Date,
-			AddedDate:    md.Source.AddedDate,
-			ModifiedDate: md.Source.ModifiedDate,
-			Size:         md.Source.FileSize,
-			Metadata:     md.Source.Metadata,
-			MD5Sum:       md.Source.MD5Sum,
-		}
-		resources[index] = dataResourceFromFile(file)
-		if resources[index].Path == "" || resources[index].Path == "/" { // permissions probem
-			return nil, &PermissionDeniedError{fileIds[index]}
+func (db *Database) Resources(fileIds []string) ([]frictionless.DataResource, error) {
+	// we use the /data_objects/{data_object_id} GET endpoint to retrieve metadata
+	// for individual files
+	// (see https://api.microbiomedata.org/docs#/find/find_data_object_by_id_data_objects__data_object_id__get)
+	// FIXME: this endpoint only allows the retrieval of metadata for a single file
+	// FIXME: this endpoint does not provide any credit metadata, nor a way to get it
+
+	resource := make([]frictionless.DataResource, len(fileIds))
+	for i, fileId := range fileIds {
+		body, err := db.get(fmt.Sprintf("data_objects/%s", fileId))
+		var dataObject DataObject
+		err = json.Unmarshal(body, &dataObject)
+		if err != nil {
+			return nil, err
 		}
-
-		// fill in holes where we can and patch up discrepancies
-		// NOTE: we don't retrieve hits.hits._source.file_type because it can be
-		// NOTE: either a string or an array of strings, and I'm just trying for a
-		// NOTE: solution
-		resources[index].Format = formatFromFileName(resources[index].Path)
-		resources[index].MediaType = mimeTypeFromFormatAndTypes(resources[index].Format, []string{})
+		resources[i] = db.dataResourceFromDataObject(dataObject)
 	}
 	return resources, err
 }
 
 func (db *Database) StageFiles(fileIds []string) (uuid.UUID, error) {
-  // NMDC keeps all of its NERSC data on disk, so all files are already staged.
-  // We simply generate a new UUID that can be handed to db.StagingStatus,
-  // which returns databases.StagingStatusSucceeded.
-  //
-  // "We may eventually use tape but don't need to yet." -Shreyas Cholia, 2024-09-04
-  return uuid.New(), nil
+	// NMDC keeps all of its NERSC data on disk, so all files are already staged.
+	// We simply generate a new UUID that can be handed to db.StagingStatus,
+	// which returns databases.StagingStatusSucceeded.
+	//
+	// "We may eventually use tape but don't need to yet." -Shreyas Cholia, 2024-09-04
+	return uuid.New(), nil
 }
 
 func (db *Database) StagingStatus(id uuid.UUID) (databases.StagingStatus, error) {
-  // all files are hot!
+	// all files are hot!
 	return databases.StagingStatusSucceeded, nil
 }