Commit 87f2744166643eda4b73deb53b565916bd07c301
1 parent
badd76e0
Update docker configuration.
Showing
10 changed files
with
87 additions
and
356802 deletions
Too many changes to show.
To preserve performance only 9 of 10 files are displayed.
Dockerfile
@@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 | @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 | ||
14 | 14 | ||
15 | # install dependencies | 15 | # install dependencies |
16 | RUN apt-get update && \ | 16 | RUN apt-get update && \ |
17 | - apt-get install -y python3-pip | 17 | + apt-get install -y python3-pip && \ |
18 | + pip3 install --upgrade pip | ||
18 | COPY ./requirements.txt /usr/src/collector/requirements.txt | 19 | COPY ./requirements.txt /usr/src/collector/requirements.txt |
19 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt | 20 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt |
20 | 21 | ||
@@ -30,33 +31,35 @@ FROM ubuntu:18.04 | @@ -30,33 +31,35 @@ FROM ubuntu:18.04 | ||
30 | RUN apt-get update && \ | 31 | RUN apt-get update && \ |
31 | apt-get install -y locales && \ | 32 | apt-get install -y locales && \ |
32 | locale-gen pl_PL.UTF-8 | 33 | locale-gen pl_PL.UTF-8 |
34 | + | ||
35 | +# set envs | ||
33 | ENV LANG pl_PL.UTF-8 | 36 | ENV LANG pl_PL.UTF-8 |
34 | ENV LC_ALL pl_PL.UTF-8 | 37 | ENV LC_ALL pl_PL.UTF-8 |
38 | +ENV HOME=/home/collector | ||
39 | +ENV APP_HOME=/home/collector/app | ||
35 | 40 | ||
36 | -# create directory for the collector user | ||
37 | -RUN mkdir -p /home/collector | ||
38 | - | ||
39 | -# create the collector user | ||
40 | -RUN addgroup --group collector && adduser collector --ingroup collector | 41 | +# create directory for the collector user and user itself |
42 | +RUN mkdir -p $HOME && \ | ||
43 | + addgroup --group collector && \ | ||
44 | + adduser collector --ingroup collector | ||
41 | 45 | ||
42 | # create the appropriate directories | 46 | # create the appropriate directories |
43 | -ENV HOME=/home/collector | ||
44 | -ENV APP_HOME=/home/collector/app | ||
45 | RUN mkdir $APP_HOME | 47 | RUN mkdir $APP_HOME |
46 | WORKDIR $APP_HOME | 48 | WORKDIR $APP_HOME |
47 | 49 | ||
48 | # install dependencies | 50 | # install dependencies |
49 | COPY --from=builder /usr/src/collector/wheels /wheels | 51 | COPY --from=builder /usr/src/collector/wheels /wheels |
50 | COPY --from=builder /usr/src/collector/requirements.txt . | 52 | COPY --from=builder /usr/src/collector/requirements.txt . |
51 | -RUN apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \ | 53 | +RUN apt-get update && apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \ |
52 | wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ | 54 | wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ |
53 | apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ | 55 | apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ |
54 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ | 56 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ |
55 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ | 57 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ |
56 | apt-get update && \ | 58 | apt-get update && \ |
57 | - apt-get install -y morfeusz2 python3-morfeusz2 && \ | ||
58 | - DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 | ||
59 | -RUN pip3 install --no-cache /wheels/* | 59 | + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \ |
60 | + DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \ | ||
61 | + pip3 install --upgrade pip && \ | ||
62 | + pip3 install --no-cache /wheels/* | ||
60 | 63 | ||
61 | # copy project | 64 | # copy project |
62 | COPY . $APP_HOME | 65 | COPY . $APP_HOME |
@@ -65,17 +68,29 @@ COPY . $APP_HOME | @@ -65,17 +68,29 @@ COPY . $APP_HOME | ||
65 | WORKDIR ./tools/liner2/g419-external-dependencies | 68 | WORKDIR ./tools/liner2/g419-external-dependencies |
66 | RUN tar -xvf ./CRF++-0.57.tar.gz | 69 | RUN tar -xvf ./CRF++-0.57.tar.gz |
67 | WORKDIR ./CRF++-0.57 | 70 | WORKDIR ./CRF++-0.57 |
68 | -RUN ./configure | ||
69 | -RUN make | ||
70 | -RUN make install | ||
71 | -RUN ldconfig | 71 | +RUN ./configure && \ |
72 | + make && \ | ||
73 | + make install && \ | ||
74 | + make clean && \ | ||
75 | + ldconfig | ||
76 | + | ||
77 | +# install eurobert | ||
78 | +WORKDIR $APP_HOME/tools | ||
79 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \ | ||
80 | + tar -xvf ./eurobert-model.tar.gz && \ | ||
81 | + rm ./eurobert-model.tar.gz | ||
82 | + | ||
83 | +# install labse | ||
84 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/labse-model.tar.gz && \ | ||
85 | + tar -xvf ./labse-model.tar.gz && \ | ||
86 | + rm ./labse-model.tar.gz | ||
72 | WORKDIR $APP_HOME | 87 | WORKDIR $APP_HOME |
73 | 88 | ||
74 | # copy django settings | 89 | # copy django settings |
75 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py | 90 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py |
76 | 91 | ||
77 | # chown all the files to the collector user | 92 | # chown all the files to the collector user |
78 | -RUN chown -R collector:collector $APP_HOME | 93 | +RUN chown -R collector:collector $HOME |
79 | 94 | ||
80 | # change to the collector user | 95 | # change to the collector user |
81 | USER collector | 96 | USER collector |
Dockerfile.marcell
@@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 | @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 | ||
14 | 14 | ||
15 | # install dependencies | 15 | # install dependencies |
16 | RUN apt-get update && \ | 16 | RUN apt-get update && \ |
17 | - apt-get install -y python3-pip | 17 | + apt-get install -y python3-pip && \ |
18 | + pip3 install --upgrade pip | ||
18 | COPY ./requirements.txt /usr/src/collector/requirements.txt | 19 | COPY ./requirements.txt /usr/src/collector/requirements.txt |
19 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt | 20 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt |
20 | 21 | ||
@@ -38,8 +39,8 @@ ENV HOME=/home/collector | @@ -38,8 +39,8 @@ ENV HOME=/home/collector | ||
38 | ENV APP_HOME=/home/collector/app | 39 | ENV APP_HOME=/home/collector/app |
39 | 40 | ||
40 | # create directory for the collector user and user itself | 41 | # create directory for the collector user and user itself |
41 | -RUN mkdir -p $HOME && \ | ||
42 | - addgroup --group collector && \ | 42 | +RUN mkdir -p $HOME && \ |
43 | + addgroup --group collector && \ | ||
43 | adduser collector --ingroup collector | 44 | adduser collector --ingroup collector |
44 | 45 | ||
45 | # create the appropriate directories | 46 | # create the appropriate directories |
@@ -55,8 +56,9 @@ RUN apt-get update && apt-get install -y openjdk-8-jre python3-pip software-prop | @@ -55,8 +56,9 @@ RUN apt-get update && apt-get install -y openjdk-8-jre python3-pip software-prop | ||
55 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ | 56 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ |
56 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ | 57 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ |
57 | apt-get update && \ | 58 | apt-get update && \ |
58 | - apt-get install -y morfeusz2 python3-morfeusz2 && \ | 59 | + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \ |
59 | DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \ | 60 | DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \ |
61 | + pip3 install --upgrade pip && \ | ||
60 | pip3 install --no-cache /wheels/* | 62 | pip3 install --no-cache /wheels/* |
61 | 63 | ||
62 | # copy project | 64 | # copy project |
@@ -66,25 +68,35 @@ COPY . $APP_HOME | @@ -66,25 +68,35 @@ COPY . $APP_HOME | ||
66 | WORKDIR ./tools/liner2/g419-external-dependencies | 68 | WORKDIR ./tools/liner2/g419-external-dependencies |
67 | RUN tar -xvf ./CRF++-0.57.tar.gz | 69 | RUN tar -xvf ./CRF++-0.57.tar.gz |
68 | WORKDIR ./CRF++-0.57 | 70 | WORKDIR ./CRF++-0.57 |
69 | -RUN ./configure && \ | ||
70 | - make && \ | ||
71 | - make install && \ | ||
72 | - make clean && \ | 71 | +RUN ./configure && \ |
72 | + make && \ | ||
73 | + make install && \ | ||
74 | + make clean && \ | ||
73 | ldconfig | 75 | ldconfig |
76 | + | ||
77 | +# install eurobert | ||
78 | +WORKDIR $APP_HOME/tools | ||
79 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \ | ||
80 | + tar -xvf ./eurobert-model.tar.gz && \ | ||
81 | + rm ./eurobert-model.tar.gz | ||
74 | WORKDIR $APP_HOME | 82 | WORKDIR $APP_HOME |
75 | 83 | ||
76 | # copy django settings | 84 | # copy django settings |
77 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py | 85 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py |
78 | 86 | ||
79 | # chown all the files to the collector user | 87 | # chown all the files to the collector user |
80 | -RUN chown -R collector:collector $APP_HOME | 88 | +RUN chown -R collector:collector $HOME |
81 | 89 | ||
82 | # configure and init database | 90 | # configure and init database |
83 | USER postgres | 91 | USER postgres |
84 | RUN /etc/init.d/postgresql start && \ | 92 | RUN /etc/init.d/postgresql start && \ |
85 | psql --command "CREATE USER collector WITH SUPERUSER PASSWORD 'collector';" && \ | 93 | psql --command "CREATE USER collector WITH SUPERUSER PASSWORD 'collector';" && \ |
86 | createdb -O collector collector && \ | 94 | createdb -O collector collector && \ |
87 | - psql collector < $APP_HOME/resources/db/marcell-init.db | 95 | + wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/marcell-init.db.tar.gz && \ |
96 | + tar -xvf ./marcell-init.db.tar.gz && \ | ||
97 | + rm ./marcell-init.db.tar.gz && \ | ||
98 | + psql collector < ./marcell-init.db && \ | ||
99 | + rm ./marcell-init.db | ||
88 | 100 | ||
89 | # change to the root user | 101 | # change to the root user |
90 | USER 0 | 102 | USER 0 |
README.md
@@ -2,13 +2,22 @@ | @@ -2,13 +2,22 @@ | ||
2 | 2 | ||
3 | ## Running MARCELL annotate API using Docker | 3 | ## Running MARCELL annotate API using Docker |
4 | 4 | ||
5 | +#### Using docker-compose | ||
6 | + | ||
5 | ```sh | 7 | ```sh |
6 | docker-compose up -d | 8 | docker-compose up -d |
7 | ``` | 9 | ``` |
8 | 10 | ||
9 | -### Exemplary usage in Python | 11 | +#### Using docker |
10 | 12 | ||
11 | ```sh | 13 | ```sh |
14 | +docker build -t "marcell-pl" -f Dockerfile.marcell . | ||
15 | +docker run --name "marcell-pl-running" -p 8006:8000 -d marcell-pl | ||
16 | +``` | ||
17 | + | ||
18 | +### Exemplary usage in Python | ||
19 | + | ||
20 | +``` | ||
12 | import requests | 21 | import requests |
13 | 22 | ||
14 | url = 'http://<container_url>:<exposed_port>/annotate' | 23 | url = 'http://<container_url>:<exposed_port>/annotate' |
@@ -20,4 +29,3 @@ with open('/text/file/path.[txt/html/pdf]', 'rb') as text_file, open('/meta/fil | @@ -20,4 +29,3 @@ with open('/text/file/path.[txt/html/pdf]', 'rb') as text_file, open('/meta/fil | ||
20 | ## Conda based installation | 29 | ## Conda based installation |
21 | 30 | ||
22 | See INSTALL.md for installation instructions. | 31 | See INSTALL.md for installation instructions. |
23 | - |
collector/collector/docker-settings.py
@@ -68,6 +68,9 @@ SOLR_URL = 'http://localhost:8983/solr/' | @@ -68,6 +68,9 @@ SOLR_URL = 'http://localhost:8983/solr/' | ||
68 | # Language-agnostic BERT sentence embedding model path | 68 | # Language-agnostic BERT sentence embedding model path |
69 | LABSE_MODEL_PATH = os.path.join(TOOLS_DIR, 'labse', 'labse_bert_model') | 69 | LABSE_MODEL_PATH = os.path.join(TOOLS_DIR, 'labse', 'labse_bert_model') |
70 | 70 | ||
71 | +# expose REST API | ||
72 | +EXPOSE_API = True | ||
73 | + | ||
71 | # Quick-start development settings - unsuitable for production | 74 | # Quick-start development settings - unsuitable for production |
72 | # See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/ | 75 | # See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/ |
73 | 76 |
collector/pipeline/urls.py
1 | from django.urls import path | 1 | from django.urls import path |
2 | 2 | ||
3 | from . import views | 3 | from . import views |
4 | +from collector import settings | ||
4 | 5 | ||
5 | -urlpatterns = [ | ||
6 | - path('annotate', views.AnnotateView.as_view(), name='annotate'), | ||
7 | -] | 6 | + |
7 | +urlpatterns = [] | ||
8 | +if settings.EXPOSE_API: | ||
9 | + urlpatterns = [ | ||
10 | + path('annotate', views.AnnotateView.as_view(), name='annotate'), | ||
11 | + ] |
collector/projects/marcell/utils.py
@@ -100,6 +100,6 @@ def vectorize_eurovocs(): | @@ -100,6 +100,6 @@ def vectorize_eurovocs(): | ||
100 | labse = Labse(settings.LABSE_MODEL_PATH) | 100 | labse = Labse(settings.LABSE_MODEL_PATH) |
101 | for eurovoc_label in EuroVocLabel.objects.filter(vector__isnull=True): | 101 | for eurovoc_label in EuroVocLabel.objects.filter(vector__isnull=True): |
102 | print('Creating EuroVoc vector representation for "{}".'.format(eurovoc_label.text)) | 102 | print('Creating EuroVoc vector representation for "{}".'.format(eurovoc_label.text)) |
103 | - laser_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))] | ||
104 | - eurovoc_label.vector = laser_vector | 103 | + labse_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))] |
104 | + eurovoc_label.vector = labse_vector | ||
105 | eurovoc_label.save() | 105 | eurovoc_label.save() |
collector/terminology/eurovoc.py
@@ -20,10 +20,13 @@ def annotate(documents): | @@ -20,10 +20,13 @@ def annotate(documents): | ||
20 | 20 | ||
21 | def _add_document_level_domains(document, title_based_tld_preditor, min_score, k): | 21 | def _add_document_level_domains(document, title_based_tld_preditor, min_score, k): |
22 | print('Adding EuroVoc domains to: {}.'.format(document.id)) | 22 | print('Adding EuroVoc domains to: {}.'.format(document.id)) |
23 | + eurovoc_domains = [] | ||
24 | + tld_score_value = 'sim' | ||
25 | + | ||
23 | if document.keywords.exists(): | 26 | if document.keywords.exists(): |
24 | - tld_score_value = 'sim' | ||
25 | eurovoc_domains = _get_keyword_based_domains(document.keywords.all(), min_score, k) | 27 | eurovoc_domains = _get_keyword_based_domains(document.keywords.all(), min_score, k) |
26 | - else: | 28 | + |
29 | + if not eurovoc_domains: | ||
27 | tld_score_value = 'prob' | 30 | tld_score_value = 'prob' |
28 | eurovoc_domains = _get_title_based_domains(document.title, title_based_tld_preditor, min_score, k) | 31 | eurovoc_domains = _get_title_based_domains(document.title, title_based_tld_preditor, min_score, k) |
29 | 32 |
entrypoint.sh
@@ -15,6 +15,8 @@ python3 collector/manage.py makemigrations --noinput | @@ -15,6 +15,8 @@ python3 collector/manage.py makemigrations --noinput | ||
15 | python3 collector/manage.py migrate | 15 | python3 collector/manage.py migrate |
16 | python3 collector/manage.py configure_marcell_pipelines | 16 | python3 collector/manage.py configure_marcell_pipelines |
17 | python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l pl | 17 | python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l pl |
18 | +python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l en | ||
18 | python3 collector/manage.py load_iate_terms -i resources/iate/iate.tbx | 19 | python3 collector/manage.py load_iate_terms -i resources/iate/iate.tbx |
20 | +python3 collector/manage.py map_eurovoc_terms | ||
19 | 21 | ||
20 | exec "$@" | 22 | exec "$@" |
requirements.txt
@@ -6,15 +6,14 @@ Django==2.2.5 | @@ -6,15 +6,14 @@ Django==2.2.5 | ||
6 | faiss | 6 | faiss |
7 | gunicorn==20.0.4 | 7 | gunicorn==20.0.4 |
8 | Keras==2.2.5 | 8 | Keras==2.2.5 |
9 | -laserembeddings[en,pl] | ||
10 | natsort | 9 | natsort |
11 | psycopg2-binary | 10 | psycopg2-binary |
12 | pysolr | 11 | pysolr |
13 | python-dateutil | 12 | python-dateutil |
14 | -pytorch | ||
15 | scikit-learn==0.22.1 | 13 | scikit-learn==0.22.1 |
16 | scrapy | 14 | scrapy |
17 | -tensorflow==1.14 | 15 | +tensorflow>=1.15,<2 |
18 | tika==1.19 | 16 | tika==1.19 |
19 | -xmldiff | ||
20 | - | 17 | +torch |
18 | +transformers>=3.0.2 | ||
19 | +xmldiff | ||
21 | \ No newline at end of file | 20 | \ No newline at end of file |