-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathDockerfile.datastet
165 lines (127 loc) · 5.75 KB
/
Dockerfile.datastet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -------------------
# build builder image
# -------------------
# This stage compiles GROBID and the datastet module with the Gradle wrapper
# and unpacks the grobid-home distribution into /opt/grobid. The runtime stage
# later copies from /opt/grobid, /opt/grobid-source and /root/.m2 of this stage.
FROM openjdk:17-jdk-slim AS builder

USER root

# Accepted as a build argument; no instruction in this stage references it.
ARG GROBID_VERSION

# unzip is required further down to unpack the grobid-home distribution
RUN apt-get update && \
    apt-get -y --no-install-recommends install unzip

WORKDIR /opt/grobid-source

# gradle
COPY gradle/ ./gradle/
COPY gradlew ./
COPY gradle.properties ./
COPY build.gradle ./
COPY settings.gradle ./

# source
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-trainer/ ./grobid-trainer/
COPY datastet/ ./datastet/

# cleaning unused native libraries before packaging
# (one layer instead of seven; rm -rf ignores paths that do not exist)
RUN rm -rf \
        grobid-home/pdf2xml \
        grobid-home/pdfalto/lin-32 \
        grobid-home/pdfalto/mac-64 \
        grobid-home/pdfalto/win-* \
        grobid-home/lib/lin-32 \
        grobid-home/lib/win-* \
        grobid-home/lib/mac-64

RUN ./gradlew clean assemble install --no-daemon --info --stacktrace

# absolute path: same directory the former relative "./datastet/" resolved to
WORKDIR /opt/grobid-source/datastet
RUN ./gradlew clean install --no-daemon --info --stacktrace

WORKDIR /opt/grobid
RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \
    chmod -R 755 /opt/grobid/grobid-home/pdfalto

# NOTE: /opt/grobid-source must stay in this stage: the runtime stage copies
# files out of it with COPY --from=builder. The former "rm -rf grobid-source"
# ran relative to /opt/grobid, matched nothing, and has been dropped.
# -------------------
# build runtime image
# -------------------
# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine
FROM tensorflow/tensorflow:2.7.0-gpu

# setting locale is likely useless but to be sure
ENV LANG=C.UTF-8

# update NVIDIA Cuda key (following a key rotation in April 2022)
# The old key and the repository lists that reference it are removed, then the
# cuda-keyring package is installed. Everything happens in a single layer so
# the downloaded .deb does not remain in the image.
# NOTE(review): wget is installed without a preceding "apt-get update" -
# presumably because the package index cannot be refreshed until the rotated
# key is in place; confirm before reordering these steps.
RUN apt-get install -y wget && \
    apt-key del 7fa2af80 && \
    rm /etc/apt/sources.list.d/cuda.list && \
    rm /etc/apt/sources.list.d/nvidia-ml.list && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm cuda-keyring_1.0-1_all.deb

# Add Tini
# tini is the container entrypoint; "-s" registers it as a process subreaper
# and everything after "--" (the image CMD) is run as its child.
# NOTE(review): the binary is fetched without checksum verification.
ENV TINI_VERSION=v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "-s", "--"]
# install JRE, python and other dependencies
# (apt lists are removed in the same layer to keep the image small)
RUN apt-get update && \
    apt-get -y --no-install-recommends install \
        apt-utils \
        build-essential \
        ca-certificates-java \
        curl \
        gcc \
        gfortran \
        libfontconfig \
        libxml2 \
        musl \
        openjdk-17-jdk \
        openjdk-17-jre-headless \
        python3 \
        python3-dev \
        python3-pip \
        python3-setuptools \
        unzip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# bring in the unpacked grobid-home produced by the builder stage
WORKDIR /opt/grobid
COPY --from=builder /opt/grobid .

# --no-cache-dir keeps pip's download cache out of the image layers
RUN python3 -m pip install --no-cache-dir --upgrade pip

# install DeLFT via pypi
RUN pip3 install --no-cache-dir requests delft==0.3.3
# link the data directory to /data
# A single link is enough: the working directory here is /opt/grobid, so a
# second "ln -s /data ./data" would follow the link just created and leave a
# self-referential /data/data symlink behind.
RUN mkdir -p /data \
    && ln -s /data /opt/grobid/data

WORKDIR /opt/grobid

# PYTHONWARNINGS: disable python warnings (and fix logging)
# NOTE(review): the final CMD of this file starts "java" directly and does not
# reference JAVA_OPTS; confirm which process is expected to consume it.
ENV PYTHONWARNINGS="ignore" \
    JAVA_OPTS=-Xmx4g
# install jep (and temporarily the matching JDK)
# jep is built by pip against a JDK unpacked under /tmp/jdk-17. Download,
# extraction, build and cleanup run in ONE layer: deleting the JDK in a later
# RUN would not shrink the image, because the earlier layers would still
# contain the tarball and the extracted tree.
ENV JDK_URL=https://download.java.net/java/GA/jdk17.0.2/dfd4a8d0985749f896bed50d7138ee7f/8/GPL/openjdk-17.0.2_linux-x64_bin.tar.gz
RUN curl --fail --show-error --location -q ${JDK_URL} -o /tmp/openjdk.tar.gz && \
    mkdir /tmp/jdk-17 && \
    tar xfz /tmp/openjdk.tar.gz --directory /tmp/jdk-17 --strip-components 1 --no-same-owner && \
    /tmp/jdk-17/bin/javac -version && \
    JAVA_HOME=/tmp/jdk-17 pip3 install --no-cache-dir jep==4.0.2 && \
    rm -f /tmp/openjdk.tar.gz && \
    rm -rf /tmp/jdk-17

# NOTE(review): the first entry hard-codes the python3.8 dist-packages path and
# the grobid-home entries are relative to the process working directory;
# confirm both against the base image and the directory the service runs from.
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH}

# remove the libjep.so bundled with grobid-home because the pip-installed jep above provides its own
RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so
# preload embeddings if needed, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded
# download GROBID fine-tuned models based on SciBERT if selected
# The preload script is taken from the builder stage and placed in /opt/grobid.
COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py /opt/grobid/preload_embeddings.py

# embeddings if needed will be loaded when building and running tests
# /opt/delft is made an alias of the GROBID installation directory
RUN ln -s /opt/grobid /opt/delft

# datastet module tree from the builder stage
COPY --from=builder /opt/grobid-source/datastet/ /opt/grobid/datastet/

# "org" subtree of the builder's local Maven repository, placed under datastet/lib
COPY --from=builder /root/.m2/repository/org/ /opt/grobid/datastet/lib/org/
# install Pub2TEI
# Fetch, unpack and rename in a single layer, then delete the archive so the
# zip does not stay in the image next to the extracted tree.
# NOTE(review): this tracks the head of the master branch, so the content of
# /opt/Pub2TEI depends on when the image is built; consider pinning a commit.
WORKDIR /opt
RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip && \
    unzip master.zip && \
    mv Pub2TEI-master Pub2TEI && \
    rm master.zip
# All remaining steps of this section run from the datastet module directory.
WORKDIR /opt/grobid/datastet

# Directory that receives the resource registry copied from the builder stage
RUN mkdir /opt/grobid/delft /opt/grobid/delft/delft
COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json /opt/grobid/delft/delft/resources-registry.json

# trigger gradle wrapper install
RUN ./gradlew --version

# Install the models, then remove resources/models and the dataseer/context
# model archives in the same layer so they do not persist in the image.
RUN ./gradlew installModels \
    && rm -rf resources/models \
    && rm ../grobid-home/models/dataseer*.zip \
    && rm ../grobid-home/models/context_*.zip
# install ELMo, if needed (do not delete)
#RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json
#RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5
#RUN mkdir /opt/elmo
#RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json /opt/elmo/
#RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 /opt/elmo/

# this will build and load embeddings on the image forever (only if required by the config) :)
# NOTE(review): the active command below assembles with tests excluded
# ("-x test"); the embedding remark above presumably refers to the
# commented-out "clean build test" variant - confirm.
WORKDIR /opt/grobid/datastet
#RUN ./gradlew clean build test
RUN ./gradlew clean assemble \
        --no-daemon \
        --stacktrace \
        --info \
        -x test
#CMD ["./gradlew", "run"]
# Default command, started through the tini ENTRYPOINT. "exec" makes the JVM
# replace the intermediate shell, so signals forwarded by tini (e.g. SIGTERM
# on "docker stop") reach the Java process instead of only the shell.
# The path is relative to the working directory /opt/grobid/datastet.
# NOTE(review): the jar name hard-codes version 0.8.1; keep it in sync with
# the version produced by the gradle build.
CMD ["sh", "-c", "exec java --add-opens java.base/java.lang=ALL-UNNAMED -jar build/libs/datastet-0.8.1-onejar.jar server resources/config/config.yml"]

# An ARG is only visible in the build stage that declares it. GROBID_VERSION
# was declared in the builder stage only, so it has to be declared again here;
# otherwise ${GROBID_VERSION} below always expands to an empty string.
ARG GROBID_VERSION
LABEL \
    authors="The contributors" \
    org.label-schema.name="datastet" \
    org.label-schema.description="Image with DataStet service" \
    org.label-schema.url="https://github.com/kermitt2/datastet" \
    org.label-schema.version="${GROBID_VERSION}"