-
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathtest.sh
More file actions
executable file
·135 lines (115 loc) · 4.15 KB
/
test.sh
File metadata and controls
executable file
·135 lines (115 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# Smoke test for a local OpenML deployment.
#
# This test assumes services are already running locally:
# `docker compose --profile all up -d`
#
# It tests some of the most important services (nginx routing, the JSON API,
# MinIO file storage, dataset upload + processing, croissant, Elasticsearch),
# but is by no means comprehensive.
# In particular, also at least check the frontpage in a browser (http://localhost:8000).
set -e
# assert_contains HAYSTACK NEEDLE
# Prints PASS when NEEDLE occurs in HAYSTACK (case-insensitive); otherwise
# prints the full haystack and exits 1.
# NEEDLE is matched as a fixed string (--fixed-strings) rather than a regex —
# every caller passes a literal — and `--` protects needles starting with '-'.
assert_contains() {
  # printf instead of echo: haystacks are arbitrary data (e.g. whole HTML pages).
  if printf '%s\n' "$1" | grep --ignore-case --fixed-strings -q -- "$2"; then
    echo "PASS: output contains '$2'"
  else
    echo "FAIL: output does not contain '$2'"
    echo "Full output:"
    echo "$1"
    exit 1
  fi
}
# assert_url_exists URL
# Probes URL with a silent HEAD request (following redirects) and exits 1
# when the request does not succeed.
assert_url_exists() {
  local url="$1"
  if ! curl --head --silent --fail --location --output /dev/null "$url"; then
    echo "FAIL: $url does not exist"
    exit 1
  fi
  echo "PASS: $url exists"
}
# nginx should serve the home page for requests to the root URL.
home_page_body=$(curl -s http://localhost:8000)
assert_contains "$home_page_body" "OpenML is an open platform for sharing datasets"

# Dataset 20 (diabetes) ships with the test data: check its JSON description
# via the API and its ARFF file served from MinIO.
dataset_url=http://localhost:8000/minio/datasets/0000/0020/dataset_37_diabetes.arff
description_url=http://localhost:8000/api/v1/json/data/20
# The JSON response may contain escaped slashes (e.g. http:\/\/); strip the
# backslashes before matching.
description=$(curl -s "$description_url" | sed 's/\\//g')
assert_contains "$description" "diabetes"
wget "$dataset_url" -O dataset.arff
assert_contains "$(cat dataset.arff)" "@data"
rm dataset.arff
# Prepare a virtual environment for the dataset upload client.
# The rest of the script addresses the venv explicitly (.venv/bin/python), so
# install with explicit venv paths instead of `source .venv/bin/activate` —
# sourcing would leak the activation into the whole script environment, and
# the reuse branch never activated at all.
if [ -d .venv ]; then
  echo "Using existing virtual environment for dataset upload."
else
  echo "Creating virtual environment for dataset upload."
  python -m venv .venv
  .venv/bin/python -m pip install uv
  # --python pins the target interpreter so uv installs into this venv
  # regardless of the surrounding environment.
  .venv/bin/uv pip install --python .venv/bin/python openml numpy
fi
echo "Attempting dataset upload"
# Upload a small synthetic dataset (3 rows x 4 REAL columns, see data.T below)
# through the openml Python client and capture the new dataset id it prints.
# NOTE(review): 'normaluser' is presumably an API key seeded into the local
# deployment's test database — confirm against the docker compose setup.
DATA_ID=$(.venv/bin/python -c "
import numpy as np
import openml
from openml.datasets import create_dataset
openml.config.server = 'http://localhost:8000/api/v1/xml'
openml.config.apikey = 'normaluser'
data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
attributes = [('col_' + str(i), 'REAL') for i in range(data.shape[1])]
dataset = create_dataset(
name='test-data',
description='Synthetic dataset created from a NumPy array',
creator='OpenML tester',
contributor=None,
collection_date='01-01-2018',
language='English',
licence='MIT',
default_target_attribute='col_' + str(data.shape[1] - 1),
row_id_attribute=None,
ignore_attribute=None,
citation='None',
attributes=attributes,
data=data,
version_label='test',
original_data_url='http://openml.github.io/openml-python',
paper_url='http://openml.github.io/openml-python',
)
dataset.publish()
print(dataset.id)
")
# Make sure DATA_ID is exactly one integer, and not some Python error output.
# Matching the whole variable with [[ =~ ]] (instead of piping through
# `grep -q '^[0-9]\+$'`) also rejects multi-line output where only one line
# is numeric — e.g. a warning printed before the id — and avoids the
# GNU-only BRE '\+' extension.
if ! [[ "$DATA_ID" =~ ^[0-9]+$ ]]; then
  echo "FAIL: DATA_ID is not an integer: '$DATA_ID'"
  exit 1
fi
# Fetch the storage URL of the dataset we just uploaded and verify the file
# exists and is a valid ARFF. Use the freshly uploaded ${DATA_ID}, not a
# hard-coded dataset id, so the check covers this run's upload.
NEW_DATASET_URL=$(curl -s "http://localhost:8000/api/v1/json/data/${DATA_ID}" | jq -r ".data_set_description.url")
assert_url_exists "$NEW_DATASET_URL"
wget "$NEW_DATASET_URL" -O new_dataset.arff
assert_contains "$(cat new_dataset.arff)" "@data"
rm new_dataset.arff
# Wait for the dataset to become active, polling every 10 seconds for up to 2 minutes
WAITED=0
ACTIVE=0
while [ "$WAITED" -lt 120 ]; do
  DATASET_STATUS=$(curl -s "http://localhost:8000/api/v1/json/data/${DATA_ID}")
  # Compare the status field itself via jq (already used elsewhere in this
  # script): grepping the whole body for "active" could match the word
  # anywhere, e.g. inside the dataset description.
  if [ "$(echo "$DATASET_STATUS" | jq -r '.data_set_description.status' 2>/dev/null)" = "active" ]; then
    echo "PASS: dataset $DATA_ID is active (after ${WAITED}s)"
    ACTIVE=1
    break
  fi
  echo "Waiting for dataset $DATA_ID to become active... (${WAITED}s elapsed)"
  sleep 10
  WAITED=$((WAITED + 10))
done
# An explicit flag distinguishes "timed out" from "succeeded on the last
# poll", instead of re-deriving the outcome from WAITED.
if [ "$ACTIVE" -ne 1 ]; then
  echo "FAIL: dataset $DATA_ID did not become active within 120s"
  echo "Full output:"
  echo "$DATASET_STATUS"
  exit 1
fi
echo "Checking parquet conversion"
PADDED_ID=$(printf "%04d" "$DATA_ID")
NEW_PARQUET_URL="http://localhost:8000/minio/datasets/0000/${PADDED_ID}/dataset_${DATA_ID}.pq"
# Download to an explicit filename (consistent with every other wget call in
# this script) instead of relying on the URL basename.
wget "$NEW_PARQUET_URL" -O "dataset_${DATA_ID}.pq"
DATA_SHAPE=$(.venv/bin/python -c "import pandas as pd; df = pd.read_parquet(\"dataset_${DATA_ID}.pq\"); print(df.shape)")
# The uploaded dataset is data.T of a 4x3 array, i.e. 3 rows x 4 columns.
assert_contains "${DATA_SHAPE}" "(3, 4)"
rm "dataset_${DATA_ID}.pq"
# The croissant service should serve metadata for the new dataset.
CROISSANT_URL="http://localhost:8000/croissant/dataset/${DATA_ID}"
# Quote all expansions (SC2086) so values containing whitespace or glob
# characters are passed as single arguments.
CROISSANT_NAME=$(curl -s "${CROISSANT_URL}" | jq -r ".name")
assert_contains "${CROISSANT_NAME}" "test-data"
# Elasticsearch should have indexed the new dataset.
ES_RESPONSE=$(curl -s "http://localhost:8000/es/data/_doc/${DATA_ID}")
assert_contains "${ES_RESPONSE}" "test-data"