From 36b10b0f9b105beb4c2f00de3fab5955230fe12c Mon Sep 17 00:00:00 2001 From: BariscanTosyali Date: Mon, 16 Mar 2026 17:28:50 +0300 Subject: [PATCH] =?UTF-8?q?Veri=20haz=C4=B1rlama=20ad=C4=B1m=C4=B1=20tamam?= =?UTF-8?q?land=C4=B1,=20get=5Fdata=20fonksiyonu=20eklendi?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_preparation.ipynb | 637 +++++++++++++++++++++++++++++++++++++++-- olist/__init__.py | 0 olist/data.py | 34 +++ tests/get_data.pickle | Bin 0 -> 437 bytes 4 files changed, 644 insertions(+), 27 deletions(-) create mode 100644 olist/__init__.py create mode 100644 olist/data.py create mode 100644 tests/get_data.pickle diff --git a/data_preparation.ipynb b/data_preparation.ipynb index d583338..0e86f3b 100644 --- a/data_preparation.ipynb +++ b/data_preparation.ipynb @@ -9,8 +9,129 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:25:19.428406Z", + "iopub.status.busy": "2026-03-16T14:25:19.427327Z", + "iopub.status.idle": "2026-03-16T14:25:21.207443Z", + "shell.execute_reply": "2026-03-16T14:25:21.206008Z", + "shell.execute_reply.started": "2026-03-16T14:25:19.428366Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seller_idseller_zip_code_prefixseller_cityseller_state
03442f8959a84dea7ee197c632cb2df1513023campinasSP
1d1b65fc7debc3361ea86b5f14c68d2e213844mogi guacuSP
2ce3ad9de960102d0677a81f5d0bb7b2d20031rio de janeiroRJ
3c0f3eea2e14555b6faeea3dd58c1b1c34195sao pauloSP
451a04a8a6bdcb23deccc82b0b80742cf12914braganca paulistaSP
\n", + "
" + ], + "text/plain": [ + " seller_id seller_zip_code_prefix \\\n", + "0 3442f8959a84dea7ee197c632cb2df15 13023 \n", + "1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 \n", + "2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 \n", + "3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 \n", + "4 51a04a8a6bdcb23deccc82b0b80742cf 12914 \n", + "\n", + " seller_city seller_state \n", + "0 campinas SP \n", + "1 mogi guacu SP \n", + "2 rio de janeiro RJ \n", + "3 sao paulo SP \n", + "4 braganca paulista SP " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "\n", + "sys.path.insert(0, os.path.abspath(os.getcwd()))\n", + "\n", + "from olist.data import Olist\n", + "data = Olist().get_data()\n", + "data['sellers'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:40.644246Z", + "iopub.status.busy": "2026-03-16T14:22:40.643244Z", + "iopub.status.idle": "2026-03-16T14:22:40.666105Z", + "shell.execute_reply": "2026-03-16T14:22:40.664962Z", + "shell.execute_reply.started": "2026-03-16T14:22:40.644203Z" + } + }, "outputs": [], "source": [ "# \"magic commands\" to enable autoreload of your imported packages\n", @@ -54,9 +175,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:43.795377Z", + "iopub.status.busy": "2026-03-16T14:22:43.794965Z", + "iopub.status.idle": "2026-03-16T14:22:43.816054Z", + "shell.execute_reply": "2026-03-16T14:22:43.814867Z", + "shell.execute_reply.started": "2026-03-16T14:22:43.795347Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/bariscan/.workintech/olist/data/csv')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pathlib import Path\n", "csv_path = 
Path(\"~/.workintech/olist/data/csv\").expanduser()\n", @@ -76,9 +216,36 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:46.707169Z", + "iopub.status.busy": "2026-03-16T14:22:46.706293Z", + "iopub.status.idle": "2026-03-16T14:22:46.724388Z", + "shell.execute_reply": "2026-03-16T14:22:46.722988Z", + "shell.execute_reply.started": "2026-03-16T14:22:46.707131Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[PosixPath('/home/bariscan/.workintech/olist/data/csv/product_category_name_translation.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_products_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_customers_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_order_payments_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_geolocation_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_order_items_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_orders_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_order_reviews_dataset.csv'),\n", + " PosixPath('/home/bariscan/.workintech/olist/data/csv/olist_sellers_dataset.csv')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "file_paths = list(csv_path.iterdir())\n", "file_paths" @@ -86,9 +253,86 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:49.931986Z", + "iopub.status.busy": "2026-03-16T14:22:49.931219Z", + "iopub.status.idle": "2026-03-16T14:22:50.498004Z", + "shell.execute_reply": "2026-03-16T14:22:50.496472Z", + "shell.execute_reply.started": 
"2026-03-16T14:22:49.931947Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
product_category_nameproduct_category_name_english
0beleza_saudehealth_beauty
1informatica_acessorioscomputers_accessories
2automotivoauto
3cama_mesa_banhobed_bath_table
4moveis_decoracaofurniture_decor
\n", + "
" + ], + "text/plain": [ + " product_category_name product_category_name_english\n", + "0 beleza_saude health_beauty\n", + "1 informatica_acessorios computers_accessories\n", + "2 automotivo auto\n", + "3 cama_mesa_banho bed_bath_table\n", + "4 moveis_decoracao furniture_decor" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Kodunuzu aşağıda test edin. Dizindeki ilk csv dosyasını yüklemeyi deneyin\n", "import pandas as pd\n", @@ -108,15 +352,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:53.721966Z", + "iopub.status.busy": "2026-03-16T14:22:53.720778Z", + "iopub.status.idle": "2026-03-16T14:22:53.742051Z", + "shell.execute_reply": "2026-03-16T14:22:53.740928Z", + "shell.execute_reply.started": "2026-03-16T14:22:53.721927Z" + }, "tags": [ "challengify" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['product_category_name_translation.csv',\n", + " 'olist_products_dataset.csv',\n", + " 'olist_customers_dataset.csv',\n", + " 'olist_order_payments_dataset.csv',\n", + " 'olist_geolocation_dataset.csv',\n", + " 'olist_order_items_dataset.csv',\n", + " 'olist_orders_dataset.csv',\n", + " 'olist_order_reviews_dataset.csv',\n", + " 'olist_sellers_dataset.csv']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE" + "file_names = [path.name for path in file_paths]\n", + "file_names" ] }, { @@ -140,15 +411,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:22:56.687132Z", + "iopub.status.busy": "2026-03-16T14:22:56.686004Z", + "iopub.status.idle": "2026-03-16T14:22:56.703979Z", + "shell.execute_reply": "2026-03-16T14:22:56.702976Z", + "shell.execute_reply.started": "2026-03-16T14:22:56.687102Z" + }, "tags": 
[ "challengify" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['product_category_name_translation',\n", + " 'products',\n", + " 'customers',\n", + " 'order_payments',\n", + " 'geolocation',\n", + " 'order_items',\n", + " 'orders',\n", + " 'order_reviews',\n", + " 'sellers']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE" + "key_names = [\n", + " name.replace('olist_', '').replace('_dataset.csv', '').replace('.csv', '') \n", + " for name in file_names\n", + "]\n", + "key_names" ] }, { @@ -184,15 +485,151 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:23:00.714105Z", + "iopub.status.busy": "2026-03-16T14:23:00.713593Z", + "iopub.status.idle": "2026-03-16T14:23:02.753272Z", + "shell.execute_reply": "2026-03-16T14:23:02.752039Z", + "shell.execute_reply.started": "2026-03-16T14:23:00.714071Z" + }, "tags": [ "challengify" ] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idcustomer_idorder_statusorder_purchase_timestamporder_approved_atorder_delivered_carrier_dateorder_delivered_customer_dateorder_estimated_delivery_date
0e481f51cbdc54678b7cc49136f2d6af79ef432eb6251297304e76186b10a928ddelivered2017-10-02 10:56:332017-10-02 11:07:152017-10-04 19:55:002017-10-10 21:25:132017-10-18 00:00:00
153cdb2fc8bc7dce0b6741e2150273451b0830fb4747a6c6d20dea0b8c802d7efdelivered2018-07-24 20:41:372018-07-26 03:24:272018-07-26 14:31:002018-08-07 15:27:452018-08-13 00:00:00
247770eb9100c2d0c44946d9cf07ec65d41ce2a54c0b03bf3443c3d931a367089delivered2018-08-08 08:38:492018-08-08 08:55:232018-08-08 13:50:002018-08-17 18:06:292018-09-04 00:00:00
3949d5b44dbf5de918fe9c16f97b45f8af88197465ea7920adcdbec7375364d82delivered2017-11-18 19:28:062017-11-18 19:45:592017-11-22 13:39:592017-12-02 00:28:422017-12-15 00:00:00
4ad21c59c0840e6cb83a9ceb5573f81598ab97904e6daea8866dbdbc4fb7aad2cdelivered2018-02-13 21:18:392018-02-13 22:20:292018-02-14 19:46:342018-02-16 18:17:022018-02-26 00:00:00
\n", + "
" + ], + "text/plain": [ + " order_id customer_id \\\n", + "0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d \n", + "1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef \n", + "2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 \n", + "3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 \n", + "4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c \n", + "\n", + " order_status order_purchase_timestamp order_approved_at \\\n", + "0 delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 \n", + "1 delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 \n", + "2 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 \n", + "3 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 \n", + "4 delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 \n", + "\n", + " order_delivered_carrier_date order_delivered_customer_date \\\n", + "0 2017-10-04 19:55:00 2017-10-10 21:25:13 \n", + "1 2018-07-26 14:31:00 2018-08-07 15:27:45 \n", + "2 2018-08-08 13:50:00 2018-08-17 18:06:29 \n", + "3 2017-11-22 13:39:59 2017-12-02 00:28:42 \n", + "4 2018-02-14 19:46:34 2018-02-16 18:17:02 \n", + "\n", + " order_estimated_delivery_date \n", + "0 2017-10-18 00:00:00 \n", + "1 2018-08-13 00:00:00 \n", + "2 2018-09-04 00:00:00 \n", + "3 2017-12-15 00:00:00 \n", + "4 2018-02-26 00:00:00 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# YOUR CODE HERE" + "data = {key: pd.read_csv(path) for key, path in zip(key_names, file_paths)}\n", + "\n", + "# Kontrol etmek için herhangi bir tablonun ilk 5 satırına bakalım\n", + "data['orders'].head()" ] }, { @@ -210,9 +647,105 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:23:58.746270Z", + "iopub.status.busy": "2026-03-16T14:23:58.745836Z", + "iopub.status.idle": "2026-03-16T14:24:00.715719Z", + 
"shell.execute_reply": "2026-03-16T14:24:00.714342Z", + "shell.execute_reply.started": "2026-03-16T14:23:58.746240Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
seller_idseller_zip_code_prefixseller_cityseller_state
03442f8959a84dea7ee197c632cb2df1513023campinasSP
1d1b65fc7debc3361ea86b5f14c68d2e213844mogi guacuSP
2ce3ad9de960102d0677a81f5d0bb7b2d20031rio de janeiroRJ
3c0f3eea2e14555b6faeea3dd58c1b1c34195sao pauloSP
451a04a8a6bdcb23deccc82b0b80742cf12914braganca paulistaSP
\n", + "
" + ], + "text/plain": [ + " seller_id seller_zip_code_prefix \\\n", + "0 3442f8959a84dea7ee197c632cb2df15 13023 \n", + "1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 \n", + "2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 \n", + "3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 \n", + "4 51a04a8a6bdcb23deccc82b0b80742cf 12914 \n", + "\n", + " seller_city seller_state \n", + "0 campinas SP \n", + "1 mogi guacu SP \n", + "2 rio de janeiro RJ \n", + "3 sao paulo SP \n", + "4 braganca paulista SP " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from olist.data import Olist\n", "Olist().get_data()['sellers'].head()" @@ -227,9 +760,47 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-16T14:23:07.795316Z", + "iopub.status.busy": "2026-03-16T14:23:07.794497Z", + "iopub.status.idle": "2026-03-16T14:23:10.511655Z", + "shell.execute_reply": "2026-03-16T14:23:10.510455Z", + "shell.execute_reply.started": "2026-03-16T14:23:07.795273Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform linux -- Python 3.12.9, pytest-8.3.4, pluggy-1.5.0 -- /home/bariscan/.pyenv/versions/3.12.9/envs/workintech/bin/python\n", + "cachedir: .pytest_cache\n", + "rootdir: /home/bariscan/data-data-preparation/tests\n", + "plugins: typeguard-4.4.2, anyio-4.8.0\n", + "\u001b[1mcollecting ... 
from pathlib import Path

import pandas as pd


class Olist:
    """Loader for the Olist CSV tables stored in a local data directory."""

    def __init__(self):
        # Same directory the data-preparation notebook reads from.
        self.csv_path = Path("~/.workintech/olist/data/csv").expanduser()

    def get_data(self):
        """Read every CSV file in ``self.csv_path`` into a dict of DataFrames.

        Each key is the file name with the ``olist_`` marker and the
        ``_dataset.csv`` / ``.csv`` ending stripped, e.g.
        ``olist_sellers_dataset.csv`` -> ``sellers`` and
        ``product_category_name_translation.csv`` ->
        ``product_category_name_translation``.

        Returns:
            dict: maps each cleaned table name to the ``pandas.DataFrame``
            loaded from the corresponding file.

        Raises:
            FileNotFoundError: if ``self.csv_path`` does not exist.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(f"Veri yolu bulunamadı: {self.csv_path}")

        # Filter the directory listing ONCE and derive both the key and the
        # file to read from the same path object.  Filtering only the names
        # while zipping them against the unfiltered listing pairs keys with the
        # wrong files as soon as the directory holds any non-CSV entry.
        # Sorting makes the resulting dict order independent of the filesystem.
        csv_files = sorted(
            path
            for path in self.csv_path.iterdir()
            if path.is_file() and path.suffix == '.csv'
        )

        data = {}
        for path in csv_files:
            key = (
                path.name
                .replace('olist_', '')
                .replace('_dataset.csv', '')
                .replace('.csv', '')
            )
            data[key] = pd.read_csv(path)

        return data