BiocPy
diff --git a/‎.github/workflows/run-tests.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/run-tests.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 2 additions & 4 deletions b/‎CHANGELOG.md‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎README.md‎
Lines changed: 57 additions & 3 deletions b/‎README.md‎
Lines changed: 57 additions & 3 deletions
diff --git a/‎docs/conf.py‎
Lines changed: 1 addition & 0 deletions b/‎docs/conf.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/index.md‎
Lines changed: 9 additions & 13 deletions b/‎docs/index.md‎
Lines changed: 9 additions & 13 deletions
diff --git a/‎lib/CMakeLists.txt‎
Lines changed: 23 additions & 0 deletions b/‎lib/CMakeLists.txt‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎lib/src/init.cpp‎
Lines changed: 11 additions & 0 deletions b/‎lib/src/init.cpp‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎lib/src/stringsetpool.cpp‎
Lines changed: 57 additions & 0 deletions b/‎lib/src/stringsetpool.cpp‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎setup.cfg‎
Lines changed: 6 additions & 4 deletions b/‎setup.cfg‎
Lines changed: 6 additions & 4 deletions
@@ -28,11 +28,11 @@ jobs:
   test:
     strategy:
       matrix:
-        python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
         platform:
           - ubuntu-latest
           - macos-latest
-          - windows-latest
+          # - windows-latest
     runs-on: ${{ matrix.platform }}
     name: Python ${{ matrix.python }}, ${{ matrix.platform }}
     steps:
 
@@ -1,7 +1,5 @@
 # Changelog
 
-## Version 0.1 (development)
+## Version 0.0.1
 
-- Feature A added
-- FIX: nasty bug #1729 fixed
-- add your changes here!
+- Initial implementation, added the DNAString and DNAStringSet classes.
@@ -1,11 +1,15 @@
 [![PyPI-Server](https://img.shields.io/pypi/v/biostrings.svg)](https://pypi.org/project/biostrings/)
-![Unit tests](https://github.com/YOUR_ORG_OR_USERNAME/biostrings/actions/workflows/run-tests.yml/badge.svg)
+![Unit tests](https://github.com/biocpy/biostrings/actions/workflows/run-tests.yml/badge.svg)
 
 # biostrings
 
-> representations for dna strings
+Efficient manipulation of genomic sequences in Python, inspired by the design of Bioconductor's [Biostrings](https://bioconductor.org/packages/Biostrings) package.
 
-A longer description of your project goes here...
+The core design relies on a **"pool and ranges"** memory model:
+
+- **DNAStringSet** stores all sequences in a single contiguous block of memory (the pool).
+- Individual sequences are defined by `start` and `width` coordinates (the ranges).
+- Slicing a `DNAStringSet` returns a **view** (a new set of ranges pointing to the same pool), making subsetting operations virtually instantaneous and memory-free, regardless of the data size.
 
 ## Install
 
@@ -15,6 +19,56 @@ To get started, install the package from [PyPI](https://pypi.org/project/biostri
 pip install biostrings
 ```
 
+## Quick Start
+
+### Working with Single Sequences
+
+The `DNAString` class represents a single DNA sequence. It enforces the IUPAC DNA alphabet and supports efficient byte-level operations.
+
+```py
+from biostrings import DNAString
+
+# Create a DNA string
+dna = DNAString("TTGAAAA-CTC-N")
+print(dna)
+# Output: TTGAAAA-CTC-N
+
+# Basic operations
+print(len(dna))            # 13
+print(dna[0:3])            # DNAString(length=3, sequence='TTG')
+
+# Reverse Complement
+# Handles IUPAC ambiguity codes correctly (e.g., N -> N, M -> K)
+rc = dna.reverse_complement()
+print(rc)
+# Output: N-GAG-TTTTCAA
+```
+
+### Working with Sets of Sequences
+
+The `DNAStringSet` is the primary container for handling collections of sequences (e.g., reads from a FASTA file).
+
+```py
+from biostrings import DNAStringSet
+
+# Efficiently create a set from a list of strings
+seqs = [
+    "ACGT",
+    "GATTACA",
+    "TTGAAAA-CTC-N",
+    "ACGTACGT"
+]
+dss = DNAStringSet(seqs, names=["s1", "s2", "s3", "s4"])
+
+print(dss)
+# Output:
+# <DNAStringSet of length 4>
+#   [ 1]   4 ACGT                 s1
+#   [ 2]   7 GATTACA              s2
+#   [ 3]  13 TTGAAAA-CTC-N        s3
+#   [ 4]   8 ACGTACGT             s4
+```
+
 <!-- biocsetup-notes -->
 
 ## Note
 
@@ -299,6 +299,7 @@
     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
     "setuptools": ("https://setuptools.pypa.io/en/stable/", None),
     "pyscaffold": ("https://pyscaffold.org/en/stable", None),
+    "iranges": ("https://biocpy.github.io/IRanges", None),
 }
 
 print(f"loading configurations for {project} {version} ...", file=sys.stderr)
 
@@ -1,18 +1,14 @@
 # biostrings
 
-representations for dna strings
+Efficient manipulation of genomic sequences in Python, inspired by the design of Bioconductor's [Biostrings](https://bioconductor.org/packages/Biostrings) package.
 
+## Install
 
-## Note
-
-> This is the main page of your project's [Sphinx] documentation. It is
-> formatted in [Markdown]. Add additional pages by creating md-files in
-> `docs` or rst-files (formatted in [reStructuredText]) and adding links to
-> them in the `Contents` section below.
->
-> Please check [Sphinx] and [MyST] for more information
-> about how to document your project and how to configure your preferences.
+To get started, install the package from [PyPI](https://pypi.org/project/biostrings/)
 
+```bash
+pip install biostrings
+```
 
 ## Contents
 
@@ -29,9 +25,9 @@ Module Reference <api/modules>
 
 ## Indices and tables
 
-* {ref}`genindex`
-* {ref}`modindex`
-* {ref}`search`
+- {ref}`genindex`
+- {ref}`modindex`
+- {ref}`search`
 
 [Sphinx]: http://www.sphinx-doc.org/
 [Markdown]: https://daringfireball.net/projects/markdown/
 
@@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 3.24)
+
+project(biostring
+    VERSION 1.0.0
+    DESCRIPTION "Building the biostrings shared library"
+    LANGUAGES CXX)
+
+# REQUIRED stops configuration right here with a clear message when pybind11
+# cannot be found, instead of failing further down on the then-unknown
+# pybind11_add_module() command.
+find_package(pybind11 CONFIG REQUIRED)
+
+# pybind11 method:
+pybind11_add_module(biostring
+    src/stringsetpool.cpp
+    src/init.cpp
+)
+
+set_property(TARGET biostring PROPERTY CXX_STANDARD 17)
+
+target_link_libraries(biostring PRIVATE pybind11::pybind11)
+
+# The resulting file is named lib_biostrings.<ext>; the name passed to
+# PYBIND11_MODULE in src/init.cpp has to match this OUTPUT_NAME.
+set_target_properties(biostring PROPERTIES
+    OUTPUT_NAME lib_biostrings
+    PREFIX ""
+)
@@ -0,0 +1,11 @@
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+void init_stringsetpool(pybind11::module &);
+
+// The name given to PYBIND11_MODULE must equal the file name of the compiled
+// extension (OUTPUT_NAME lib_biostrings in lib/CMakeLists.txt). Python looks
+// up the PyInit_<name> entry point derived from the file name, so a mismatch
+// makes the import fail.
+PYBIND11_MODULE(lib_biostrings, m) {
+    m.doc() = "cpp implementations";
+
+    // Register the functions defined in stringsetpool.cpp on this module.
+    init_stringsetpool(m);
+}
@@ -0,0 +1,57 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/numpy.h>
+#include <cctype>
+#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace py = pybind11;
+
+// Equivalent to R's new_XStringSet_from_CHARACTER C-function.
+//
+// Concatenates every sequence in `py_seqs` into one contiguous byte pool and
+// records where each sequence sits inside it. Each character is upper-cased
+// and checked against the accepted alphabet "ACGTRYSWKMBDHVN-".
+//
+// Returns: (pool (bytes), starts (int32 numpy array), widths (int32 numpy array))
+// Throws:  std::invalid_argument if a sequence contains a character outside
+//          the accepted alphabet;
+//          std::overflow_error if the pool would grow beyond what int32
+//          offsets can address.
+py::tuple create_dnastringset_pool(py::list py_seqs) {
+    const size_t n = py_seqs.size();
+
+    py::array_t<int32_t> np_starts(n);
+    py::array_t<int32_t> np_widths(n);
+
+    int32_t* starts_ptr = np_starts.mutable_data();
+    int32_t* widths_ptr = np_widths.mutable_data();
+
+    // Starts and widths are stored as int32, so the pool must stay within
+    // that range for every offset to remain representable.
+    const size_t max_pool_size = static_cast<size_t>(std::numeric_limits<int32_t>::max());
+    const std::string valid_chars = "ACGTRYSWKMBDHVN-";
+    std::string pool;
+
+    for (size_t i = 0; i < n; ++i) {
+        std::string s = py_seqs[i].cast<std::string>();
+
+        // Guard before narrowing to int32: without this check a large input
+        // would silently wrap the recorded starts/widths.
+        if (s.size() > max_pool_size - pool.size()) {
+            throw std::overflow_error(
+                "Sequence " + std::to_string(i) + " makes the pool exceed the size addressable with 32-bit offsets"
+            );
+        }
+
+        for (char &c : s) {
+            // std::toupper is undefined for negative arguments, which is what
+            // non-ASCII bytes become where `char` is signed; route the value
+            // through unsigned char first.
+            c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
+            if (valid_chars.find(c) == std::string::npos) {
+                throw std::invalid_argument(
+                    "Sequence " + std::to_string(i) + " contains invalid DNA character: " + c
+                );
+            }
+        }
+
+        // The sequence starts where the pool currently ends.
+        starts_ptr[i] = static_cast<int32_t>(pool.size());
+        widths_ptr[i] = static_cast<int32_t>(s.size());
+        pool += s;
+    }
+
+    return py::make_tuple(py::bytes(pool), np_starts, np_widths);
+}
+
+// Registers the string-set pool builder on the extension module `m` and sets
+// the module docstring.
+void init_stringsetpool(pybind11::module &m) {
+    m.doc() = "C++ extensions for biostrings";
+    m.def(
+        "create_dnastringset_pool",
+        &create_dnastringset_pool,
+        "Efficiently create the pool and ranges for a DNAStringSet from a list of strings."
+    );
+}
@@ -1,6 +1,6 @@
 [build-system]
 # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
-requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
+requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5", "cmake", "pybind11", "numpy"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools_scm]
@@ -11,7 +11,7 @@ version_scheme = "no-guess-dev"
 [tool.ruff]
 line-length = 120
 src = ["src"]
-exclude = ["tests"]
+# exclude = ["tests"]
 lint.extend-ignore = ["F821"]
 
 [tool.ruff.lint.pydocstyle]
 
@@ -5,7 +5,7 @@
 
 [metadata]
 name = biostrings
-description = representations for dna strings
+description = Efficient manipulation of genomic sequences
 author = Jayaram Kancherla
 author_email = [email protected]
 license = MIT
@@ -15,8 +15,8 @@ long_description_content_type = text/markdown; charset=UTF-8; variant=GFM
 url = https://github.com/pyscaffold/pyscaffold/
 # Add here related links, for example:
 project_urls =
-    Documentation = https://pyscaffold.org/
-#    Source = https://github.com/pyscaffold/pyscaffold/
+    Documentation = https://github.com/BiocPy/biostrings
+    Source = https://github.com/BiocPy/biostrings
 #    Changelog = https://pyscaffold.org/en/latest/changelog.html
 #    Tracker = https://github.com/pyscaffold/pyscaffold/issues
 #    Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
@@ -41,14 +41,16 @@ package_dir =
     =src
 
 # Require a min/specific Python version (comma-separated conditions)
-# python_requires = >=3.8
+python_requires = >=3.9
 
 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
 # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
 # new major versions. This works if the required packages follow Semantic Versioning.
 # For more information, check out https://semver.org/.
 install_requires =
     importlib-metadata; python_version<"3.8"
+    iranges
+    numpy
 
 
 [options.packages.find]
Original file line number	Diff line number	Diff line change
`@@ -299,6 +299,7 @@`
`299`	`299`	`"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),`
`300`	`300`	`"setuptools": ("https://setuptools.pypa.io/en/stable/", None),`
`301`	`301`	`"pyscaffold": ("https://pyscaffold.org/en/stable", None),`
	`302`	`+ "iranges": ("https://biocpy.github.io/IRanges", None),`
`302`	`303`	`}`
`303`	`304`
`304`	`305`	`print(f"loading configurations for {project} {version} ...", file=sys.stderr)`