Skip to content

Commit 87e7eac

Browse files
author
renaud gaudin
committed
[WIP] libzim7 sotoki rewrite stub
rewrote preparation step: - mostly in python - better documentation - external-tools (p7zip, sort) are optional (but faster) - bundled domains listing tool into main program
1 parent e346ea7 commit 87e7eac

19 files changed

+1499
-109
lines changed

.gitignore

Lines changed: 132 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,134 @@
1-
*~
2-
*pyc
3-
db/
4-
dumps/
5-
\.#*
1+
2+
# Byte-compiled / optimized / DLL files
3+
__pycache__/
4+
*.py[cod]
5+
*$py.class
6+
7+
# C extensions
8+
*.so
9+
10+
# Distribution / packaging
11+
.Python
612
build/
7-
db.backup/
8-
_work
9-
work
10-
venv/
11-
sotoki.egg-info/
13+
develop-eggs/
1214
dist/
15+
downloads/
16+
eggs/
17+
.eggs/
18+
lib/
19+
lib64/
20+
parts/
21+
sdist/
22+
var/
23+
wheels/
24+
pip-wheel-metadata/
25+
share/python-wheels/
26+
*.egg-info/
27+
.installed.cfg
28+
*.egg
29+
MANIFEST
30+
31+
# PyInstaller
32+
# Usually these files are written by a python script from a template
33+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
34+
*.manifest
35+
*.spec
36+
37+
# Installer logs
38+
pip-log.txt
39+
pip-delete-this-directory.txt
40+
41+
# Unit test / coverage reports
42+
htmlcov/
43+
.tox/
44+
.nox/
45+
.coverage
46+
.coverage.*
47+
.cache
48+
nosetests.xml
49+
coverage.xml
50+
*.cover
51+
*.py,cover
52+
.hypothesis/
53+
.pytest_cache/
54+
55+
# Translations
56+
*.mo
57+
*.pot
58+
59+
# Django stuff:
60+
*.log
61+
local_settings.py
62+
db.sqlite3
63+
db.sqlite3-journal
64+
65+
# Flask stuff:
66+
instance/
67+
.webassets-cache
68+
69+
# Scrapy stuff:
70+
.scrapy
71+
72+
# Sphinx documentation
73+
docs/_build/
74+
75+
# PyBuilder
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
.python-version
87+
88+
# pipenv
89+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
91+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
92+
# install all needed dependencies.
93+
#Pipfile.lock
94+
95+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
96+
__pypackages__/
97+
98+
# Celery stuff
99+
celerybeat-schedule
100+
celerybeat.pid
101+
102+
# SageMath parsed files
103+
*.sage.py
104+
105+
# Environments
106+
.env
107+
.venv
108+
env/
109+
venv/
110+
ENV/
111+
env.bak/
112+
venv.bak/
113+
114+
# Spyder project settings
115+
.spyderproject
116+
.spyproject
117+
118+
# Rope project settings
119+
.ropeproject
120+
121+
# mkdocs documentation
122+
/site
123+
124+
# mypy
125+
.mypy_cache/
126+
.dmypy.json
127+
dmypy.json
128+
129+
# Pyre type checker
130+
.pyre/
131+
132+
.DS_Store
133+
134+
.dockerignore

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
### 2.0.0.dev0
2+
3+
- rewrite using python-libzim (libzim7)
4+
- added --list-all option to list all available stackexchange domains
5+
- added --preparation-only to only prepare XML files
6+
- faster XML dumps creation step (x5)
7+
18
### 1.3.2.dev0
29

310
* removed pre-generated identicons (#141)

Dockerfile

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,23 @@
1-
FROM python:3.8
1+
FROM python:3.8-slim
22

3-
# Install necessary packages
4-
RUN apt-get update -y && \
5-
apt-get install -y --no-install-recommends advancecomp libxml2-dev libxslt1-dev libbz2-dev p7zip-full gif2apng imagemagick libjpeg-dev libpng-dev locales && \
6-
apt-get clean && \
7-
rm -rf /var/lib/apt/lists/*
3+
RUN apt-get update -y \
4+
&& apt-get install -y --no-install-recommends unzip p7zip tzdata wget \
5+
&& apt-get clean \
6+
&& rm -rf /var/lib/apt/lists/*
87

9-
# Install jpegoptim
10-
RUN wget http://www.kokkonen.net/tjko/src/jpegoptim-1.4.6.tar.gz && \
11-
tar xvf jpegoptim-1.4.6.tar.gz && \
12-
cd jpegoptim-1.4.6 && ./configure && make all install && \
13-
rm -rf jpegoptim-1.4.6*
8+
RUN echo "UTC" > /etc/timezone
9+
ENV LANG en_US.UTF-8
10+
ENV LANGUAGE en_US:en
11+
ENV LC_ALL en_US.UTF-8
1412

15-
# Install pngquant
16-
RUN wget http://pngquant.org/pngquant-2.12.5-src.tar.gz && \
17-
tar xvf pngquant-2.12.5-src.tar.gz && \
18-
cd pngquant-2.12.5 && ./configure && make all install && \
19-
rm -rf pngquant-2.12.5*
13+
# TEMP: install pylibzim and scraperlib through built wheels (until release)
14+
RUN wget http://tmp.kiwix.org/wheels/libzim-1.0.0.dev0-cp38-cp38-manylinux1_x86_64.whl \
15+
&& wget http://tmp.kiwix.org/wheels/zimscraperlib-1.4.0.dev0-py3-none-any.whl \
16+
&& pip install --no-cache-dir *.whl
2017

21-
# Install gifsicle
22-
RUN wget https://www.lcdf.org/gifsicle/gifsicle-1.92.tar.gz && \
23-
tar xvf gifsicle-1.92.tar.gz && \
24-
cd gifsicle-1.92 && ./configure && make all install && \
25-
rm -rf gifsicle-1.92*
26-
27-
# Install sotoki
28-
RUN locale-gen "en_US.UTF-8"
2918
COPY requirements.txt /tmp/requirements.txt
30-
RUN pip3 install -r /tmp/requirements.txt
31-
COPY . /app
32-
WORKDIR /app
33-
RUN python3 setup.py install
34-
WORKDIR /
35-
RUN rm -rf /app
19+
RUN pip install --no-cache-dir -U pip && pip install --no-cache-dir -r /tmp/requirements.txt
20+
COPY . /app/
21+
RUN cd /app && python setup.py install && cd - && rm -rf /app
3622

37-
# Boot commands
38-
CMD sotoki ; /bin/bash
23+
CMD ["sotoki", "--help"]

MANIFEST.in

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1+
graft src
12
include *.md
2-
recursive-include sotoki *.js *.md *.txt *.sh *.css *.svg *.eot *.ttf *.woff *.woff2 *.html VERSION
3+
include requirements.txt
4+
include LICENSE

README.md

Lines changed: 21 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,41 @@
1-
# Sotoki
1+
Sotoki
2+
========
23

3-
*Stack Overflow to Kiwix*
4+
`sotoki` (*stackoverflow to kiwix*) is an [OpenZIM](https://github.com/openzim) scraper to create offline versions of [Stack Exchange](https://stackexchange.com) websites such as [stack overflow](https://stackoverflow.com/).
45

5-
The goal of this project is to create a suite of tools to create
6-
[zim](https://openzim.org) files required by
7-
[kiwix](https://kiwix.org/) reader to make available [Stack Overflow](https://stackoverflow.com/)
8-
offline (without access to Internet). This use stackexchange dump from [Stack Exchange Data Dump](https://archive.org/details/stackexchange)
6+
It is based on Stack Exchange's Data Dumps hosted by [The Internet Archive](https://archive.org/download/stackexchange/).
97

10-
[![PyPI](https://img.shields.io/pypi/v/sotoki.svg)](https://pypi.python.org/pypi/sotoki)
11-
[![Docker Build Status](https://img.shields.io/docker/build/openzim/sotoki)](https://hub.docker.com/r/openzim/sotoki)
128
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/sotoki/badge)](https://www.codefactor.io/repository/github/openzim/sotoki)
9+
[![Docker](https://img.shields.io/docker/v/openzim/sotoki?label=docker&sort=semver)](https://hub.docker.com/r/openzim/sotoki)
1310
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
11+
[![PyPI version shields.io](https://img.shields.io/pypi/v/sotoki.svg)](https://pypi.org/project/sotoki/)
1412

15-
## Getting started
13+
## ⚠️ Warning
1614

17-
The use of btrfs as a file system is recommended (and required for stackoverflow)
15+
`sotoki` is undergoing a major rewrite to use libzim7 and its python binding in order to bypass filesystem limitations seen in version `1.x`. Use a tagged version until this warning is removed as **current master is not functional**.
1816

19-
Install non python dependencies:
20-
```bash
21-
sudo apt-get install jpegoptim pngquant gifsicle advancecomp python-pip python-virtualenv python-dev libxml2-dev libxslt1-dev libbz2-dev p7zip-full python-pillow gif2apng imagemagick
22-
```
23-
24-
Create a virtual environment for python:
25-
```bash
26-
virtualenv --system-site-packages -p python3 ./
27-
```
17+
## Usage
2818

29-
Activate the virtual enviroment:
30-
```bash
31-
source ./bin/activate
32-
```
19+
`sotoki` works off a `domain` that you must provide. That is the domain name of the stackexchange website you want to scrape. Run `sotoki --list-all` to get a list of those.
3320

34-
Install this lib:
35-
```bash
36-
pip3 install sotoki
37-
```
21+
### Docker
3822

39-
Usage:
4023
```bash
41-
sotoki <domain> <publisher> [--directory=<dir>] [--nozim] [--tag-depth=<tag_depth>] [--threads=<threads>] [--zimpath=<zimpath>] [--reset] [--reset-images] [--clean-previous] [--nofulltextindex] [--ignoreoldsite] [--nopic] [--no-userprofile]
24+
docker run -v my_dir:/output openzim/sotoki sotoki --help
4225
```
4326

44-
You can use `sotoki -h` to have more explanation about these options
27+
### Virtualenv
4528

46-
## Example
29+
`sotoki` is Python3 software. If you are not using the [Docker](https://docker.com) image, you are advised to use it in a virtual environment to avoid installing software dependencies on your system.
4730

4831
```bash
49-
for S in `./list_all.sh`
50-
do
51-
sotoki $S Kiwix --threads=12 --reset --clean-previous --no-userprofile
52-
done
32+
python3 -m venv env # Create virtualenv
33+
source env/bin/activate # Activate the virtualenv
34+
pip3 install sotoki # Install dependencies
35+
sotoki --help # Display sotoki help
5336
```
5437

55-
## License
38+
Call `deactivate` to quit the virtual environment.
39+
40+
See `requirements.txt` for the list of python dependencies.
5641

57-
[GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see
58-
[LICENSE](LICENSE) for more details.

requirements.txt

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
1-
Jinja2==2.11.1
2-
lxml>=4.5.2,<4.6
3-
MarkupSafe==1.1.1
4-
docopt==0.6.2
5-
python-slugify==4.0.0
6-
beautifulsoup4==4.9.1
7-
mistune>=2.0.0a3
8-
Pillow==7.1.1
9-
kiwixstorage>=0.2,<1.0
10-
pif==0.8.2
11-
zimscraperlib>=1.3.1,<1.4
12-
1+
kiwixstorage>=0.7,<1.0
2+
pif>=0.8.2,<0.9
3+
zimscraperlib>=1.4.0.dev0,<1.5
4+
xml_to_dict>=0.1.6,<0.2
5+
cli-formatter>=1.2.0,<1.3
6+
py7zr>=0.16.1,<0.17

setup.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
import pathlib
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# vim: ai ts=4 sts=4 et sw=4 nu
24

5+
import pathlib
36
from setuptools import setup, find_packages
47

58
root_dir = pathlib.Path(__file__).parent
6-
with open(root_dir.joinpath("requirements.txt"), "r") as fh:
7-
requirements = fh.read()
89

910

1011
def read(*names, **kwargs):
@@ -14,28 +15,35 @@ def read(*names, **kwargs):
1415

1516
setup(
1617
name="sotoki",
17-
version=read("sotoki", "VERSION").strip(),
18-
description="Make zimfile from stackexchange dump",
19-
long_description=open("README.md").read(),
20-
author="dattaz",
21-
author_email="[email protected]",
22-
url="http://github.com/kiwix/sotoki",
23-
keywords="kiwix zim stackexchange offline",
24-
license="GPL",
25-
packages=find_packages(exclude=["contrib", "docs", "tests*"]),
18+
version=read("src", "sotoki", "VERSION").strip(),
19+
description="Turn StackExchange dumps into ZIM files for offline usage",
20+
long_description=read("README.md"),
21+
long_description_content_type="text/markdown",
22+
author="Kiwix",
23+
author_email="[email protected]",
24+
url="https://github.com/openzim/sotoki",
25+
keywords="kiwix zim offline stackechange stackoverflow",
26+
license="GPLv3+",
27+
packages=find_packages("src"),
28+
package_dir={"": "src"},
2629
install_requires=[
2730
line.strip()
28-
for line in requirements.splitlines()
31+
for line in read("requirements.txt").splitlines()
2932
if not line.strip().startswith("#")
3033
],
3134
zip_safe=False,
32-
platforms="Linux",
3335
include_package_data=True,
34-
entry_points={"console_scripts": ["sotoki=sotoki.sotoki:run"]},
36+
entry_points={
37+
"console_scripts": [
38+
"sotoki=sotoki.__main__:main",
39+
]
40+
},
3541
classifiers=[
3642
"Development Status :: 4 - Beta",
3743
"Intended Audience :: Developers",
3844
"Programming Language :: Python",
39-
"Programming Language :: Python :: 3",
45+
"Programming Language :: Python :: 3.9",
46+
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
4047
],
48+
python_requires=">=3.6",
4149
)

src/sotoki/VERSION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2.0.0.dev0

src/sotoki/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)