diff --git a/Meadowlark-js/backends/meadowlark-kafka-stream/docker/docker-compose.yml b/Meadowlark-js/backends/meadowlark-kafka-stream/docker/docker-compose.yml
index a311a87f..fe91b667 100644
--- a/Meadowlark-js/backends/meadowlark-kafka-stream/docker/docker-compose.yml
+++ b/Meadowlark-js/backends/meadowlark-kafka-stream/docker/docker-compose.yml
@@ -31,11 +31,10 @@ services:
       - ZOOKEEPER_CONNECT=zookeeper:2181
       - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka1:9092
-
-  connect:
-    hostname: kafka-connect
-    container_name: kafka-connect
-    image: edfialliance/connect-meadowlark:2.3-1@sha256:6605d2f0ad1797ccf7e3f7a4dbe690bb0c9e198dd6a0d5720a7b170d0bc4ca95
+  connect-source:
+    hostname: kafka-connect-source
+    container_name: kafka-connect-source
+    image: edfialliance/connect-meadowlark:2.3-2@sha256:e6792ab9c797a27ae15e2fba68e874cf9898658f947ba0448182b5e269b1bbb8
     ports:
       - 8083:8083
     networks:
       - meadowlark-net
@@ -49,6 +48,23 @@ services:
       - OFFSET_STORAGE_TOPIC=debezium_offset
       - STATUS_STORAGE_TOPIC=debezium_status
 
+  connect-sink:
+    hostname: kafka-connect-sink
+    container_name: kafka-connect-sink
+    image: edfialliance/connect-meadowlark:2.3-2@sha256:e6792ab9c797a27ae15e2fba68e874cf9898658f947ba0448182b5e269b1bbb8
+    ports:
+      - 8084:8083
+    networks:
+      - meadowlark-net
+    links:
+      - kafka
+    environment:
+      - BOOTSTRAP_SERVERS=kafka1:9092
+      # The sink worker needs its own group id and internal topics; two Kafka Connect
+      # workers sharing the same group id and storage topics would join the same
+      # distributed Connect cluster, defeating the source/sink separation.
+      - GROUP_ID=2
+      - CONFIG_STORAGE_TOPIC=debezium_sink_config
+      - OFFSET_STORAGE_TOPIC=debezium_sink_offset
+      - STATUS_STORAGE_TOPIC=debezium_sink_status
+
   # Kafka Web UI - https://github.com/obsidiandynamics/kafdrop
   kafdrop:
     hostname: kafdrop
diff --git a/Meadowlark-js/backends/meadowlark-kafka-stream/docker/readme.md b/Meadowlark-js/backends/meadowlark-kafka-stream/docker/readme.md
index e7dcc6d1..7e86dbed 100644
--- a/Meadowlark-js/backends/meadowlark-kafka-stream/docker/readme.md
+++ b/Meadowlark-js/backends/meadowlark-kafka-stream/docker/readme.md
@@ -3,7 +3,7 @@
 To setup with Debezium and connect to MongoDB and OpenSearch, run the `docker compose up -d`. Then
 execute the following steps:
 
-## Configure Debezium
+## Configure Debezium (Source)
 
 The Debezium Kafka Connector must be configured with the MongoDB admin username and password to listen to
 MongoDB change stream. To do this, copy the `debezium-mongodb.json.example` file to `debezium-mongodb.json`.
 Edit the json file and insert
@@ -23,7 +23,21 @@
 Invoke-RestMethod -Method Post -InFile .\debezium-mongodb.json `
   -uri http://localhost:8083/connectors/ -ContentType "application/json"
 ```
 
-## Send Kafka Events to OpenSearch
+### Verify source configuration
+
+To check that the source connector worker is running and its plugins are installed, execute:
+
+```bash
+curl http://localhost:8083/connector-plugins | jq .
+```
+
+```pwsh
+Invoke-RestMethod http://localhost:8083/connector-plugins | ConvertTo-Json | ConvertFrom-Json
+```
+
+This lists the installed connector plugins, which include the Debezium source connectors.
+
+## Send Kafka Events to OpenSearch (Sink)
 
 The Debezium Kafka Connector must be configured with the OpenSearch admin username and password to send
 the data streams to opensearch. To do this, copy the `opensearch_sink.json.example` file to
 `opensearch_sink.json`. Edit the json file and insert the connection username and password.
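+
+For illustration only, a sink registration payload generally has the shape shown below. The
+connector class and option names here are assumptions (modeled on the Aiven OpenSearch sink
+connector); the `.example` file in this folder is the authoritative template:
+
+```json
+{
+  "name": "opensearch-sink",
+  "config": {
+    "connector.class": "io.aiven.kafka.connect.opensearch.OpensearchSinkConnector",
+    "connection.url": "http://opensearch:9200",
+    "connection.username": "<opensearch admin username>",
+    "connection.password": "<opensearch admin password>",
+    "topics": "<comma-separated list of topics to index>"
+  }
+}
+```
+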
 Then send the configuration to the Debezium Kafka Connector:
@@ -32,29 +46,29 @@
 Linux:
 
 ```bash
 curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" \
-  http://localhost:8083/connectors/ -d @opensearch_sink.json
+  http://localhost:8084/connectors/ -d @opensearch_sink.json
 ```
 
 Windows:
 
 ```pwsh
 Invoke-RestMethod -Method Post -InFile .\opensearch_sink.json `
-  -uri http://localhost:8083/connectors/ -ContentType "application/json"
+  -uri http://localhost:8084/connectors/ -ContentType "application/json"
 ```
 
-### Verify configuration
+### Verify sink configuration
 
-To check that connectors are running, execute:
+To check that the sink connector worker is running and its plugins are installed, execute:
 
 ```bash
-curl http://localhost:8083/connector-plugins | jq .
+curl http://localhost:8084/connector-plugins | jq .
 ```
 
 ```pwsh
-Invoke-RestMethod http://localhost:8083/connector-plugins | ConvertTo-Json | ConvertFrom-Json
+Invoke-RestMethod http://localhost:8084/connector-plugins | ConvertTo-Json | ConvertFrom-Json
 ```
 
-This returns the debezium connectors and the OpenSearch connector information.
+This lists the installed connector plugins, which include the OpenSearch sink connector.
 
 ### Browsing Kafka Topics and Messages
diff --git a/docs/performance-testing/RND-604-Raw-Results.txt b/docs/performance-testing/RND-604-Raw-Results.txt
new file mode 100644
index 00000000..88ae440b
--- /dev/null
+++ b/docs/performance-testing/RND-604-Raw-Results.txt
@@ -0,0 +1,30 @@
+
+
+-- Before splitting functionalities of kafka connect:
+
+Invoke-RestMethod -Method Post -InFile .\debezium-mongodb.json -uri http://localhost:8083/connectors/ -ContentType "application/json"
+Invoke-RestMethod -Method Post -InFile .\opensearch_sink.json -uri http://localhost:8083/connectors/ -ContentType "application/json"
+
+--- Time:
+00:02:50.4047199
+00:02:32.3251508
+00:02:34.7602723
+00:02:35.2213248
+00:02:45.5291286
+
+AVG: 00:02:39.647
+
+
+-- After splitting functionalities of kafka connect:
+
+Invoke-RestMethod -Method Post -InFile .\debezium-mongodb.json -uri http://localhost:8083/connectors/ -ContentType "application/json"
+Invoke-RestMethod -Method Post -InFile .\opensearch_sink.json -uri http://localhost:8084/connectors/ -ContentType "application/json"
+
+--- Time:
+00:02:32.0613715
+00:02:32.9250768
+00:02:19.7885605
+00:02:42.5871903
+00:02:39.0919751
+
+AVG: 00:02:33.290
diff --git a/docs/performance-testing/two-functionalities-of-kafka-connect-separated.md b/docs/performance-testing/two-functionalities-of-kafka-connect-separated.md
new file mode 100644
index 00000000..1e727655
--- /dev/null
+++ b/docs/performance-testing/two-functionalities-of-kafka-connect-separated.md
@@ -0,0 +1,91 @@
+# RND-604: Separate Kafka Connect containers to separate the source from the sink
+
+## Goal
+
+The Debezium Connect image ships with source connectors that retrieve data from MongoDB, PostgreSQL,
+and other databases. The image built for Meadowlark adds a sink connector that writes data out to
+other destinations, and this can affect performance since the same Connect worker is both reading
+and writing data.
+
+Evaluate whether using two separate containers improves performance: one container that runs only
+the Debezium source connectors, and another container, built on top of the Kafka Connect image,
+that adds the OpenSearch and Elasticsearch sink connectors.
+
+## Methodology
+
+1. Before splitting the container functionality, start Meadowlark fully in Docker,
+using MongoDB as the backend and OpenSearch as the search provider.
+
+   ```pwsh
+   cd Meadowlark-js
+   ./reset-docker-compose.ps1
+   ```
+
+2. Bulk upload the "partial grand bend" data set, capturing the time taken (see the timing
+sketch after this list).
+
+   ```pwsh
+   cd ../eng/performance
+   .\BulkLoad-Performance.ps1 -Template "PartialGrandBend"
+   ```
+
+3. Repeat the measurement process for a total of 5 measurements with the same settings.
+4. Split the container functionality, start everything over, and repeat the 5 measurements.
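+
+As referenced in step 2, the following is a minimal sketch of how each run's elapsed time can be
+captured. `Measure-Command` returns a `TimeSpan`, whose default string form matches the
+`00:02:50.4047199` notation used in `RND-604-Raw-Results.txt`; the output file name here is
+hypothetical:
+
+```pwsh
+# Run one bulk load (from eng/performance) and capture its wall-clock time.
+$elapsed = Measure-Command {
+  .\BulkLoad-Performance.ps1 -Template "PartialGrandBend"
+}
+
+# Append the timing in the same hh:mm:ss.fffffff notation used in the raw results file.
+$elapsed.ToString() | Out-File -Append .\RND-604-timings.txt
+```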
+
+## Environment
+
+The bulk load client runs on the host machine, which has 16 GB of RAM and an
+Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz with 6 cores and 12 logical processors, using WSL2.
+Docker has been configured to use 8 GB of RAM and 10 cores.
+
+Baseline `.env` configuration file:
+
+```none
+OAUTH_SIGNING_KEY=
+OWN_OAUTH_CLIENT_ID_FOR_CLIENT_AUTH=meadowlark_verify-only_key_1
+OWN_OAUTH_CLIENT_SECRET_FOR_CLIENT_AUTH=meadowlark_verify-only_secret_1
+OAUTH_SERVER_ENDPOINT_FOR_OWN_TOKEN_REQUEST=http://localhost:3000/local/oauth/token
+OAUTH_SERVER_ENDPOINT_FOR_TOKEN_VERIFICATION=http://localhost:3000/local/oauth/verify
+OAUTH_HARD_CODED_CREDENTIALS_ENABLED=true
+
+OPENSEARCH_USERNAME=admin
+OPENSEARCH_PASSWORD=admin
+OPENSEARCH_ENDPOINT=http://opensearch-ml-local-node1:9200
+OPENSEARCH_REQUEST_TIMEOUT=10000
+
+AUTHORIZATION_STORE_PLUGIN=@edfi/meadowlark-mongodb-backend
+DOCUMENT_STORE_PLUGIN=@edfi/meadowlark-mongodb-backend
+QUERY_HANDLER_PLUGIN=@edfi/meadowlark-opensearch-backend
+LISTENER1_PLUGIN=@edfi/meadowlark-opensearch-backend
+
+MONGODB_USER=mongo
+MONGODB_PASS=
+MONGO_URI=mongodb://${MONGODB_USER}:${MONGODB_PASS}@mongo1:27017,mongo2:27018,mongo3:27019/?replicaSet=rs0&maxPoolSize=100
+
+FASTIFY_RATE_LIMIT=false
+FASTIFY_PORT=3000
+
+FASTIFY_NUM_THREADS=4
+
+MEADOWLARK_STAGE=local
+LOG_LEVEL=warn
+IS_LOCAL=true
+
+BEGIN_ALLOWED_SCHOOL_YEAR=2022
+END_ALLOWED_SCHOOL_YEAR=2035
+ALLOW_TYPE_COERCION=true
+ALLOW__EXT_PROPERTY=true
+
+SAVE_LOG_TO_FILE=true
+LOG_FILE_LOCATION=c:/temp/
+```
+
+## Results
+
+| Scenario                                           | Avg (mm:ss.ms) |
+| -------------------------------------------------- | -------------- |
+| One container for source and sink                  | 2:39.647       |
+| One container for source and another one for sink  | 2:33.290       |
+
+Given these results, there is a small improvement (roughly 4%, about six seconds on a
+two-and-a-half-minute bulk load) when executing the bulk load with two containers, and we
+have decided to keep the two-container configuration. The averages can be recomputed from
+the raw timings as shown below.
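+
+For reference, a short PowerShell sketch that recomputes both averages from the raw timings in
+`RND-604-Raw-Results.txt` (the values below are copied from that file):
+
+```pwsh
+# Average the five "before" and five "after" bulk load timings, in seconds.
+$before = '00:02:50.4047199','00:02:32.3251508','00:02:34.7602723','00:02:35.2213248','00:02:45.5291286'
+$after  = '00:02:32.0613715','00:02:32.9250768','00:02:19.7885605','00:02:42.5871903','00:02:39.0919751'
+
+$avgBefore = ($before | ForEach-Object { [TimeSpan]::Parse($_).TotalSeconds } | Measure-Object -Average).Average
+$avgAfter  = ($after  | ForEach-Object { [TimeSpan]::Parse($_).TotalSeconds } | Measure-Object -Average).Average
+$pct = [math]::Round(100 * (1 - $avgAfter / $avgBefore), 1)
+
+# Prints: Before: 159.648 s; After: 153.291 s; Improvement: 4%
+"Before: {0:n3} s; After: {1:n3} s; Improvement: {2}%" -f $avgBefore, $avgAfter, $pct
+```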