diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..f84a5f5e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.analysis.extraPaths": [ + "./miniDB" + ] +} \ No newline at end of file diff --git a/mdb.py b/mdb.py index a981e5be..3ba99ae8 100644 --- a/mdb.py +++ b/mdb.py @@ -41,7 +41,6 @@ def in_paren(qsplit, ind): def create_query_plan(query, keywords, action): ''' Given a query, the set of keywords that we expect to pe present and the overall action, return the query plan for this query. - This can and will be used recursively ''' @@ -95,16 +94,39 @@ def create_query_plan(query, keywords, action): if action=='create table': args = dic['create table'][dic['create table'].index('('):dic['create table'].index(')')+1] + #print("\n") dic['create table'] = dic['create table'].removesuffix(args).strip() + + # 4 primary key arg_nopk = args.replace('primary key', '')[1:-1] arglist = [val.strip().split(' ') for val in arg_nopk.split(',')] - dic['column_names'] = ','.join([val[0] for val in arglist]) - dic['column_types'] = ','.join([val[1] for val in arglist]) + + # 4 unique columns + arg_nounique = args.replace('unique', '')[1:-1] + arglist1 = [val.strip().split(' ') for val in arg_nounique.split(',')] + + dic['column_names'] = ','.join([val[0] for val in arglist1]) + dic['column_types'] = ','.join([val[1] for val in arglist1]) + if 'primary key' in args: + #print("primary here") arglist = args[1:-1].split(' ') - dic['primary key'] = arglist[arglist.index('primary')-2] + + dic['primary key'] = arglist[arglist.index('primary')-2] # -2 tp find key's name, -1 to find key's data type e.g string/ integer else: dic['primary key'] = None + + # handle unique columns + if 'unique' in args: + arglist1 = args[1:-1].split(' ') + indx_lst = [idx for idx, value in enumerate(arglist1) if value == 'unique' or value == 'unique,'] + + dic['unique'] = ','.join(arglist1[n-2] for n in indx_lst) + else: + dic['unique'] = None + #print("\n") + #print(dic) + #print("\n") if action=='import': dic = {'import table' if key=='import' else key: val for key, val in dic.items()} @@ -175,7 +197,8 @@ def interpret(query): 'unlock table': ['unlock table', 'force'], 'delete from': ['delete from', 'where'], 'update table': ['update table', 'set', 'where'], - 'create index': ['create index', 'on', 'using'], + #'create index': ['create index', 'on', 'using'], + 'create index': ['create index', 'on', 'column', 'using'], 'drop index': ['drop index'], 'create view' : ['create view', 'as'] } @@ -205,9 +228,7 @@ def execute_dic(dic): def interpret_meta(command): """ Interpret meta commands. These commands are used to handle DB stuff, something that can not be easily handled with mSQL given the current architecture. - The available meta commands are: - lsdb - list databases lstb - list tables cdb - change/create database @@ -295,4 +316,4 @@ def remove_db(db_name): if isinstance(result,Table): result.show() except Exception: - print(traceback.format_exc()) + print(traceback.format_exc()) \ No newline at end of file diff --git a/miniDB/btree.py b/miniDB/btree.py index f0676209..de110826 100644 --- a/miniDB/btree.py +++ b/miniDB/btree.py @@ -15,7 +15,7 @@ def __init__(self, b, values=None, ptrs=None,left_sibling=None, right_sibling=No self.parent = parent # the index of a buckets parent self.is_leaf = is_leaf # a boolean value signaling whether the node is a leaf or not - + def find(self, value, return_ops=False): ''' Returns the index of the next node to search for a value if the node is not a leaf (a ptrs of the available ones). @@ -28,10 +28,20 @@ def find(self, value, return_ops=False): ops = 0 # number of operations (<>= etc). Used for benchmarking if self.is_leaf: # return - + + ''' + if (isinstance(value, int)): + value = float(value) + ''' + #print(value) # for each value in the node, if the user supplied value is smaller, return the btrees value index # else (no value in the node is larger) return the last ptr + #print(self.values) + #for index, existing_val in enumerate(self.values): + #print("existing val: ", existing_val) + #print("index: ", index) for index, existing_val in enumerate(self.values): + #print("existing val: ", existing_val) ops+=1 if value is None or existing_val is None: continue @@ -221,8 +231,6 @@ def split(self, node_id): self.split(node.parent) - - def show(self): ''' Show important info for each node (sort by level - root first, then left to right). @@ -288,9 +296,18 @@ def find(self, operator, value): operator: string. The provided evaluation operator. value: float. The value being searched for. ''' + + ''' + if (isinstance(value, int)): + value = float(value) + ''' results = [] + + # find the index of the node that the element should exist in leaf_idx, ops = self._search(value, True) + #print("leaf idx: ",leaf_idx) + #print("ops: ", ops) target_node = self.nodes[leaf_idx] if operator == '=': @@ -343,6 +360,7 @@ def find(self, operator, value): target_node = self.nodes[target_node.left_sibling] results.extend(target_node.ptrs) + # print the number of operations (usefull for benchamrking) # print(f'With BTree -> {ops} comparison operations') return results diff --git a/miniDB/database.py b/miniDB/database.py index a3ac6be7..e9c5e6df 100644 --- a/miniDB/database.py +++ b/miniDB/database.py @@ -1,5 +1,6 @@ from __future__ import annotations import pickle +import random from time import sleep, localtime, strftime import os,sys import logging @@ -14,7 +15,9 @@ from joins import Inlj, Smj from btree import Btree from misc import split_condition +from misc import split_not_condition from table import Table +from hash import Hash # readline.clear_history() @@ -54,7 +57,7 @@ def __init__(self, name, load=True, verbose = True): self.create_table('meta_length', 'table_name,no_of_rows', 'str,int') self.create_table('meta_locks', 'table_name,pid,mode', 'str,int,str') self.create_table('meta_insert_stack', 'table_name,indexes', 'str,list') - self.create_table('meta_indexes', 'table_name,index_name', 'str,str') + self.create_table('meta_indexes', 'table_name,table_column,index_name,index_type', 'str,str,str,str') self.save_database() def save_database(self): @@ -101,7 +104,7 @@ def _update(self): self._update_meta_insert_stack() - def create_table(self, name, column_names, column_types, primary_key=None, load=None): + def create_table(self, name, column_names, column_types, primary_key=None, unique=None, load=None): ''' This method create a new table. This table is saved and can be accessed via db_object.tables['table_name'] or db_object.table_name @@ -112,8 +115,9 @@ def create_table(self, name, column_names, column_types, primary_key=None, load= primary_key: string. The primary key (if it exists). load: boolean. Defines table object parameters as the name of the table and the column names. ''' + #print(primary_key) # print('here -> ', column_names.split(',')) - self.tables.update({name: Table(name=name, column_names=column_names.split(','), column_types=column_types.split(','), primary_key=primary_key, load=load)}) + self.tables.update({name: Table(name=name, column_names=column_names.split(','), column_types=column_types.split(','), primary_key=primary_key, unique=unique.split(',') if unique is not None else None, load=load)}) # self._name = Table(name=name, column_names=column_names, column_types=column_types, load=load) # check that new dynamic var doesnt exist already # self.no_of_tables += 1 @@ -257,7 +261,6 @@ def cast(self, column_name, table_name, cast_type): def insert_into(self, table_name, row_str): ''' Inserts data to given table. - Args: table_name: string. Name of table (must be part of database). row: list. A list of values to be inserted (will be casted to a predifined type automatically). @@ -292,8 +295,16 @@ def update_table(self, table_name, set_args, condition): set_column: string. The column to be altered. condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. - + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or + + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' or + + 'column between value1 and value2' . + Operatores supported: (<,<=,==,>=,>) ''' set_column, set_value = set_args.replace(' ','').split('=') @@ -314,14 +325,22 @@ def delete_from(self, table_name, condition): table_name: string. Name of table (must be part of database). condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or + + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' or + + 'column between value1 and value2' . Operatores supported: (<,<=,==,>=,>) ''' self.load_database() - lock_ownership = self.lock_table(table_name, mode='x') deleted = self.tables[table_name]._delete_where(condition) + if lock_ownership: self.unlock_table(table_name) self._update() @@ -331,8 +350,10 @@ def delete_from(self, table_name, condition): self._add_to_insert_stack(table_name, deleted) self.save_database() - def select(self, columns, table_name, condition, distinct=None, order_by=None, \ - limit=True, desc=None, save_as=None, return_object=True): + + + def select (self, columns, table_name, condition, distinct=None, order_by=None, \ + limit=True, desc=None, save_as=None, return_object=True): ''' Selects and outputs a table's data where condtion is met. @@ -341,9 +362,20 @@ def select(self, columns, table_name, condition, distinct=None, order_by=None, \ columns: list. The columns that will be part of the output table (use '*' to select all available columns) condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or - Operatores supported: (<,<=,==,>=,>) + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' or + + 'column between value1 and value2 or not column[<,<=,==,>=,>]value or ... ' or + 'column between value1 and value2 or column[<,<=,==,>=,>]value or ... ' or + 'column[<,<=,==,>=,>]value or column between value1 and value2 or ... ' or + 'not column[<,<=,==,>=,>]value or column between value1 and value2 or ... ' . + + Operators supported: (<,<=,==,>=,>) order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None). desc: boolean. If True, order_by will return results in descending order (True by default). limit: int. An integer that defines the number of rows that will be returned (all rows if None). @@ -352,35 +384,141 @@ def select(self, columns, table_name, condition, distinct=None, order_by=None, \ distinct: boolean. If True, the resulting table will contain only unique rows. ''' - # print(table_name) self.load_database() if isinstance(table_name,Table): return table_name._select_where(columns, condition, distinct, order_by, desc, limit) + flag = 0 + statistics_OR = False if condition is not None: - condition_column = split_condition(condition)[0] - else: - condition_column = '' + print("Condition is: " + condition+"\n") + + operator = ' or ' + operator1 = ' and ' + operator3 = ' between ' + + # case: complex AND and OR conditions + if(operator in condition and operator1 in condition and operator3 not in condition): + + flag = 1 + if self.is_locked(table_name): + return + + table = self.tables[table_name]._select_where_and_or(columns, condition, distinct, order_by, desc, limit) + + # OR or AND in condition + elif (operator in condition or operator1 in condition and operator3 not in condition): + + flag = 1 # found + if self.is_locked(table_name): + return + + if (operator in condition): # OR in condition + + #print("OR here") + ''' + test for OR optimizations + - first check if all conditions refer to the same column name + ''' + splt = condition.split(operator) + condition_column_name = splt[0].split(' ')[0] + #print("Condition column name is: ",condition_column_name) + + flag1 = False + for i in range(len(splt)): + if 'not' not in splt[i]: + if splt[i].split(' ')[0] == condition_column_name: + flag1 = True + else: + flag1 = False + break + else: # NOT in condition + if splt[i].split(' ')[1] == condition_column_name: + flag1 = True + else: + flag1 = False + break + if flag1: + print("----OR optimizations-----") + print("All conditions refer to the same column!") + + if (self._has_index(table_name) and (condition_column_name in self.tables['meta_indexes'].column_by_name('table_column') and + 'not' not in splt)): + + statistics_OR = True + expression_list = [] + + print("BTree index has been found!") + + for s in splt: + s1 = s.split(condition_column_name) + expression_list.append(s1[1]) + + tuple1 = tuple(expression_list) + #print(tuple1) + condition1 = f'{condition_column_name}' " IN " f'{tuple1}' + print("New term is: ",condition1) + else: + print("Conditions do not refer to the same column!") + table = self.tables[table_name]._select_where_or(columns, condition, distinct, order_by, desc, limit) + else: # AND in condition + # Try to use Equivalence Transformation Rules: + k = random.randint(0, 1) # decide on k once + #print("random k is: ",k) - # self.lock_table(table_name, mode='x') - if self.is_locked(table_name): - return - if self._has_index(table_name) and condition_column==self.tables[table_name].column_names[self.tables[table_name].pk_idx]: - index_name = self.select('*', 'meta_indexes', f'table_name={table_name}', return_object=True).column_by_name('index_name')[0] - bt = self._load_idx(index_name) - table = self.tables[table_name]._select_where_with_btree(columns, bt, condition, distinct, order_by, desc, limit) - else: - table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) + if k == 0: # use Equivalence Transformation Rules + table = self.tables[table_name].equivalence_transformation_rules(columns, condition, distinct, order_by, desc, limit) + else: # choose the default path -> function select_where_and + table = self.tables[table_name]._select_where_and(columns, condition, distinct, order_by, desc, limit) + + else: + if(condition[:4] == 'not '): # NOT in condition + condition_column = split_not_condition(condition)[0] + else: #NOT not in condition + condition_column = split_condition(condition)[0] + + else: # just a simple select * query + condition_column = '' + + if (flag == 0): # a simple select query + # self.lock_table(table_name, mode='x') + if self.is_locked(table_name): + return + + if (self._has_index(table_name) and (condition_column in self.tables['meta_indexes'].column_by_name('table_column') and + 'not ' not in condition)): + + index_name = self.select('*', 'meta_indexes', f'table_name = {table_name} and table_column = {condition_column}', return_object=True).column_by_name('index_name')[0] + index_name = ('').join(index_name) + + index_type = self.select('*', 'meta_indexes', f'table_name = {table_name} and table_column = {condition_column} and index_name = {index_name}', return_object=True).column_by_name('index_type') + index_type = ('').join(index_type) + + print("Index_name is: ",index_name) + print("Index_type is: ",index_type) + + if index_type == 'btree': + bt = self._load_idx(index_name) + table = self.tables[table_name]._select_where_with_btree(columns, bt, condition, distinct, order_by, desc, limit) + else: # Extendible hashing + h = self._load_idx(index_name) + table = self.tables[table_name]._select_where_with_hash(columns, h, condition, distinct, order_by, desc, limit) + + else: + #print("No select where with btree") + table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) + # self.unlock_table(table_name) - if save_as is not None: - table._name = save_as - self.table_from_object(table) - else: - if return_object: - return table + if (statistics_OR == False): + if save_as is not None: + table._name = save_as + self.table_from_object(table) else: - return table.show() + if return_object: + return table + else: + return table.show() def show_table(self, table_name, no_of_rows=None): @@ -650,57 +788,103 @@ def _update_meta_insert_stack_for_tb(self, table_name, new_stack): # indexes - def create_index(self, index_name, table_name, index_type='btree'): + def create_index(self, index_name, table_name, column_name, index_type): ''' Creates an index on a specified table with a given name. - Important: An index can only be created on a primary key (the user does not specify the column). - + The index is created over a primary key or over a unique column + (the user has to specify the column). + Args: - table_name: string. Table name (must be part of database). index_name: string. Name of the created index. - ''' - if self.tables[table_name].pk_idx is None: # if no primary key, no index - raise Exception('Cannot create index. Table has no primary key.') - if index_name not in self.tables['meta_indexes'].column_by_name('index_name'): + table_name: string with the following format: + TableName 'column' columnName index_type + IMPORTANT: The TableName (must be part of database) + column_name: string. Name of the column where the index is created over (must be part of database). + + ''' + + if (column_name != None): # look 4 unique columns + #print("case: look 4 pk or 4 unique column") + #print(self.tables[table_name].pk) + if (column_name != self.tables[table_name].pk and column_name not in self.tables[table_name].unique): + raise Exception('Cannot create index. The column you specified is not unique or table has no primary key.') + + if index_name not in self.tables['meta_indexes'].column_by_name('index_name'): # currently only btree is supported. This can be changed by adding another if. - if index_type=='btree': - logging.info('Creating Btree index.') - # insert a record with the name of the index and the table on which it's created to the meta_indexes table - self.tables['meta_indexes']._insert([table_name, index_name]) - # crate the actual index - self._construct_index(table_name, index_name) - self.save_database() + if index_type == 'btree': # case1 : index btree + logging.info('Creating Btree index.') + print('Creating Btree index.') + # insert a record with the name of the index, the index type, the table and the table's column on which it's created to the meta_indexes table + self.tables['meta_indexes']._insert([table_name, column_name , index_name, index_type]) + # create the actual index + self._construct_index(table_name, column_name, index_name) + self.save_database() + + elif index_type == 'hashing': # case2 : index hash + logging.info('Creating hash index.') + print('Creating hash index.') + # insert a record with the name of the index, the index type, the table and the table's column on which it's created to the meta_indexes table + self.tables['meta_indexes']._insert([table_name, column_name, index_name, index_type]) + # crate the actual index + self._construct_hash_index(table_name, column_name, index_name) + self.save_database() + + else: + raise Exception('Cannot create index. Another index with the same name already exists.') else: - raise Exception('Cannot create index. Another index with the same name already exists.') + raise Exception('Cannot create index. You have to specify the column first.') + - def _construct_index(self, table_name, index_name): + def _construct_index(self, table_name, column_name, index_name): ''' Construct a btree on a table and save. Args: table_name: string. Table name (must be part of database). + column_name: string. Name of the table's column where the index is created over (must be part of database). index_name: string. Name of the created index. ''' bt = Btree(3) # 3 is arbitrary # for each record in the primary key of the table, insert its value and index to the btree - for idx, key in enumerate(self.tables[table_name].column_by_name(self.tables[table_name].pk)): + # for each record in the specified unique column of the table, insert its value and index to the btree + for idx, key in enumerate(self.tables[table_name].column_by_name(column_name)): if key is None: continue bt.insert(key, idx) # save the btree - self._save_index(index_name, bt) + self._save_index( index_name, bt) + + + def _construct_hash_index(self, table_name, column_name, index_name): + ''' + Construct extendible hashing on a table and save. + Args: + table_name: string. Table name (must be part of database). + column_name: string. Name of the table's column where the index is created over (must be part of database). + index_name: string. Name of the created index. + ''' + + h=Hash() + for idx, key in enumerate(self.tables[table_name].column_by_name(column_name)): + if key is None: + continue + h.insert(key, idx) + self._save_index(index_name,h) + def _has_index(self, table_name): ''' Check whether the specified table's primary key column is indexed. + Check whether the specified table's unique column is indexed. Args: table_name: string. Table name (must be part of database). ''' return table_name in self.tables['meta_indexes'].column_by_name('table_name') + def _save_index(self, index_name, index): ''' Save the index object. @@ -717,6 +901,7 @@ def _save_index(self, index_name, index): with open(f'{self.savedir}/indexes/meta_{index_name}_index.pkl', 'wb') as f: pickle.dump(index, f) + def _load_idx(self, index_name): ''' Load and return the specified index. @@ -729,6 +914,7 @@ def _load_idx(self, index_name): f.close() return index + def drop_index(self, index_name): ''' Drop index from current database. @@ -745,4 +931,10 @@ def drop_index(self, index_name): warnings.warn(f'"{self.savedir}/indexes/meta_{index_name}_index.pkl" not found.') self.save_database() - \ No newline at end of file + + + def handle_or_op(self, columns, table_name, s, distinct=None, order_by=None, \ + limit=True, desc=None, save_as=None, return_object=True): + + self.select(columns, table_name, s, distinct, order_by, desc, limit) + \ No newline at end of file diff --git a/miniDB/hash.py b/miniDB/hash.py new file mode 100644 index 00000000..e10c3a16 --- /dev/null +++ b/miniDB/hash.py @@ -0,0 +1,172 @@ +import math +class Hash: + def __init__(self): + self.capacity=3 #bucket capacity + self.global_depth=1 + bucket1=Bucket(bucket=[],ld=1) + bucket2=Bucket(bucket=[],ld=1) + self.data={'0':bucket1,'1':bucket2} + + + def get_hash_index(self,key): + ''' + Hash function.Returns the hashed value. + + key:The key used for placing the tuple in the correct bucket according to a hash function. + ''' + if type(key)==int: + h=key + elif type(key)==float: + h=math.ceil(key) + else: # type string + h=0 + for c in key: + h+=ord(c) # ascii number + size=2**(self.global_depth) + hash=h%(size) #returns number of lsb + hashed=int(bin(hash)[2:]) + hash_index=str(hashed) + if(self.global_depth>1): + if(len(hash_index)!=self.global_depth): + hash_index=hash_index.zfill(self.global_depth) #fills with zeros + return hash_index + + def insert(self,key,value): + ''' + Insert the key and its value(pointer) to the appropriate bucket. + Args: + key:The key used for placing the tuple in the correct bucket. + value: int. The ptr of the inserted value (e.g. its index). + ''' + hash_key = str(self.get_hash_index(key)) + found_key = False + + for record in enumerate(self.data): + record_key, record_val = record + if record_key == key and record_val==value: + found_key = True + break + if not found_key: + if len(self.data[hash_key].bucket)==self.capacity: #full bucket + self.split_bucket(key,hash_key,value) # call split function + else: # no split -> add tuple to bucket + self.data[hash_key].bucket.append((key,value)) + + + def split_bucket(self,key,hash_key,value): + ''' + Checks the global depth in relation to the local depth.If the global depth is equal to the local depth + then directory expansion,rehashing of the bucket and increment by one of the local and global depth occur.If the + local depth is less than the global depth then only rehashing of the bucket and increment by one of the local depth occur. + + Args: + key:The key used for placing the tuple in the correct bucket. + hash_key:str.The hashed key where the bucket overflow occurs. + value: int. The ptr of the inserted value (e.g. its index). + + ''' + list1=[] + for key1,value1 in(self.data[hash_key].bucket): + list1.append((key1,value1)) + + if((key,value)) not in list1: + list1.append((key,value)) + if self.global_depth==self.data[hash_key].ld: # global depth = local depth + self.global_depth+=1 + self.directory_expansion(hash_key) + self.rehashing(list1) + elif self.data[hash_key].ld': + if k > value: + rows.append(ind) + elif operator == '>=': + if k >= value: + rows.append(ind) + elif operator == '<': + if k < value: + rows.append(ind) + elif operator == '<=': + if k <= value: + rows.append(ind) + + #remove duplicates first + rows = list(dict.fromkeys(rows)) + return rows + + def show(self): + ''' + Print the whole dictionary (keys and values). + ''' + for item in self.data: + print("Key : {} , Value : {}".format(item,self.data[item].bucket)) + +class Bucket: + ''' + The bucket abstraction. + ''' + def __init__(self,bucket,ld): + self.bucket= [] if bucket is None else bucket + self.ld=ld + + + + \ No newline at end of file diff --git a/miniDB/misc.py b/miniDB/misc.py index aefada74..0efd9cae 100644 --- a/miniDB/misc.py +++ b/miniDB/misc.py @@ -1,14 +1,43 @@ import operator +def operator_between(value, condition): + begin, end = condition.split('and') + + begin = begin.strip() + end = end.strip() + + begin = begin.replace("'","") + end = end.replace("'","") + + if (begin.isnumeric() and end.isnumeric()): + begin = int(begin) + end = int(end) + value = int(value) + if (value >= begin) and (value <= end): + return True + + elif (not(begin.isnumeric() or end.isnumeric())): + if (value >= begin) and (value <= end): + return True + + else: + raise Exception("Values must be of the same type!") + + return False + + def get_op(op, a, b): ''' Get op as a function of a and b by using a symbol ''' - ops = {'>': operator.gt, + ops = { '>': operator.gt, '<': operator.lt, '>=': operator.ge, '<=': operator.le, - '=': operator.eq} + '=': operator.eq, + '<>': operator.ne, + 'between': operator_between + } try: return ops[op](a,b) @@ -16,17 +45,41 @@ def get_op(op, a, b): return False def split_condition(condition): + ops = {'>=': operator.ge, '<=': operator.le, '=': operator.eq, '>': operator.gt, - '<': operator.lt} + '<': operator.lt, + 'between': operator_between + } for op_key in ops.keys(): splt=condition.split(op_key) + + #print(splt) + if len(splt)>1: # operator has been found + left, right = splt[0].strip(), splt[1].strip() + + #print("split is: ") + #print(splt) if len(splt)>1: left, right = splt[0].strip(), splt[1].strip() + + if op_key == 'between': # between in condition + begin,end = right.split('and') + begin = begin.strip() + end = end.strip() + if (begin[0] == '"' == begin[-1]) or (end[0] == '"' == end[-1]): + begin = begin.strip('"',"") + end = end.strip('"',"") + elif ( ' ' in begin) or ( ' ' in end): + raise ValueError(f'Invalid condition: {condition}\nValue must be enclosed in double quotation marks to include whitespaces.') + if (begin.find('"') != -1) or (end.find('"') != -1): # If there are any double quotes in the value, throw. (Notice we've already removed the leading and trailing ones) + raise ValueError(f'Invalid condition: {condition}\nDouble quotation marks are not allowed inside values.') + return left, op_key, right + if right[0] == '"' == right[-1]: # If the value has leading and trailing quotes, remove them. right = right.strip('"') elif ' ' in right: # If it has whitespaces but no leading and trailing double quotes, throw. @@ -34,9 +87,9 @@ def split_condition(condition): if right.find('"') != -1: # If there are any double quotes in the value, throw. (Notice we've already removed the leading and trailing ones) raise ValueError(f'Invalid condition: {condition}\nDouble quotation marks are not allowed inside values.') - return left, op_key, right + def reverse_op(op): ''' Reverse the operator given @@ -48,3 +101,36 @@ def reverse_op(op): '<=' : '>=', '=' : '=' }.get(op) + + +def not_op(op): + ''' + Handle operator not, by changing the operator given + ''' + return { + '>' : '<=', + '>=' : '<', + '<' : '>=', + '<=' : '>', + '=' : '<>' + }.get(op) + + +def split_not_condition(condition): # not salary > 50000 + splt = condition.split(' ') + #print(splt) + op_key = not_op(splt[2]) + left, right = splt[1].strip(), splt[3].strip() + + if right[0] == '"' == right[-1]: # If the value has leading and trailing quotes, remove them. + right = right.strip('"') + elif ' ' in right: # If it has whitespaces but no leading and trailing double quotes, throw. + raise ValueError(f'Invalid condition: {condition}\nValue must be enclosed in double quotation marks to include whitespaces.') + + + if right.find('"') != -1: # If there are any double quotes in the value, throw. (Notice we've already removed the leading and trailing ones) + raise ValueError(f'Invalid condition: {condition}\nDouble quotation marks are not allowed inside values.') + + return left, op_key, right + + diff --git a/miniDB/table.py b/miniDB/table.py index f5c7d937..c497f29b 100644 --- a/miniDB/table.py +++ b/miniDB/table.py @@ -1,4 +1,6 @@ from __future__ import annotations +import itertools +import random from tabulate import tabulate import pickle import os @@ -6,7 +8,7 @@ sys.path.append(f'{os.path.dirname(os.path.dirname(os.path.abspath(__file__)))}/miniDB') -from misc import get_op, split_condition +from misc import get_op, split_condition, split_not_condition class Table: @@ -26,9 +28,11 @@ class Table: - a dictionary that includes the appropriate info (all the attributes in __init__) ''' - def __init__(self, name=None, column_names=None, column_types=None, primary_key=None, load=None): + def __init__(self, name=None, column_names=None, column_types=None, primary_key=None, unique=None, load=None): + if load is not None: + #print("here") # if load is a dict, replace the object dict with it (replaces the object with the specified one) if isinstance(load, dict): self.__dict__.update(load) @@ -40,15 +44,28 @@ def __init__(self, name=None, column_names=None, column_types=None, primary_key= # if name, columns_names and column types are not none elif (name is not None) and (column_names is not None) and (column_types is not None): + #print("here1") self._name = name if len(column_names)!=len(column_types): raise ValueError('Need same number of column names and types.') self.column_names = column_names + self.unique = unique self.columns = [] - + self.unique_cols_idx = [] + + ''' + for c in self.unique: + if c not in self.__dir__(): + # this is used in order to be able to call a column using its name as an attribute. + # example: instead of table.columns['column_name'], we do table.column_name + setattr(self, c, []) + self.unique.append([]) + else: + raise Exception(f'"{c}" attribute already exists in "{self.__class__.__name__} "class.') + ''' for col in self.column_names: if col not in self.__dir__(): # this is used in order to be able to call a column using its name as an attribute. @@ -66,8 +83,16 @@ def __init__(self, name=None, column_names=None, column_types=None, primary_key= self.pk_idx = self.column_names.index(primary_key) else: self.pk_idx = None - + + if unique is not None: + for c in unique: + self.unique_cols_idx.append(self.column_names.index(c)) + else: + self.unique_cols_idx = [] + self.pk = primary_key + self.unique = unique + # self._update() # if any of the name, columns_names and column types are none. return an empty table object @@ -146,26 +171,92 @@ def _update_rows(self, set_value, set_column, condition): set_column: string. The column to be altered. condition: string. A condition using the following format: 'column[<,<=,=,>=,>]value' or - 'value[<,<=,=,>=,>]column'. - + 'not column[<,<=,=,>=,>]value' or + 'value[<,<=,=,>=,>]column' or + + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' + 'column between value1 and value2' . + Operatores supported: (<,<=,=,>=,>) ''' - # parse the condition - column_name, operator, value = self._parse_condition(condition) - - # get the condition and the set column - column = self.column_by_name(column_name) - set_column_idx = self.column_names.index(set_column) - - # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] - # for each value in column, if condition, replace it with set_value - for row_ind, column_value in enumerate(column): - if get_op(operator, column_value, value): + operator = ' or ' + operator1 = ' and ' + operator3 = ' between ' + if (operator in condition and operator1 in condition and operator3 not in condition): + splt = condition.split(operator) + for s in splt: + self._update_rows(set_value, set_column, s) + + elif (operator in condition and operator3 not in condition): # or in condition + splt = condition.split(operator) + for s in splt: + # parse the condition + column_name, operator, value = self._parse_condition(s) + + # get the condition and the set column + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + + # for each value in column, if condition, replace it with set_value + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + self.data[row_ind][set_column_idx] = set_value + + elif (operator1 in condition and operator3 not in condition): # and in condition + rows = [] + rows1 = [] + splt = condition.split(operator1) + + column_name, operator, value = self._parse_condition(splt[0]) + # get the condition and the set column + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + + # for each value in column, if condition, replace it with set_value + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + rows.append(row_ind) + + for s in splt[1:]: + # parse the condition + column_name, operator, value = self._parse_condition(s) + # get the condition and the set column + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + + # for each value in column, if condition, replace it with set_value + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + rows1.append(row_ind) + rows = [c for c in rows if c in rows1] + #print(rows) + if len(rows) == 0: # no common element + break + + for row_ind in rows: self.data[row_ind][set_column_idx] = set_value - - # self._update() - # print(f"Updated {len(indexes_to_del)} rows") + + else: + # parse the condition + column_name, operator, value = self._parse_condition(condition) + + # get the condition and the set column + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + + # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] + + # for each value in column, if condition, replace it with set_value + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + self.data[row_ind][set_column_idx] = set_value + + # self._update() + # print(f"Updated {len(indexes_to_del)} rows") def _delete_where(self, condition): @@ -178,18 +269,70 @@ def _delete_where(self, condition): Args: condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' or + + 'column between value1 and value2' . + Operatores supported: (<,<=,==,>=,>) ''' - column_name, operator, value = self._parse_condition(condition) + operator = ' or ' + operator1 = ' and ' + operator3 = ' between ' indexes_to_del = [] - - column = self.column_by_name(column_name) - for index, row_value in enumerate(column): - if get_op(operator, row_value, value): - indexes_to_del.append(index) + indexes_to_del1 = [] + + if (operator in condition and operator1 in condition and operator3 not in condition): # OR and AND in condition + print("complex AND and OR found!") + print(condition) + splt = condition.split(operator) + for s in splt: + self._delete_where(s) + #self._delete_where_and_or(condition) + + elif (operator in condition and operator3 not in condition): # OR in condition + splt = condition.split(operator) + for s in splt: + column_name, operator, value = self._parse_condition(s) + column = self.column_by_name(column_name) + for index, row_value in enumerate(column): + if get_op(operator, row_value, value): + indexes_to_del.append(index) + + elif(operator1 in condition and operator3 not in condition): # AND in condition + splt = condition.split(operator1) + #indexes_to_del1 = [] + column_name, operator, value = self._parse_condition(splt[0]) + column = self.column_by_name(column_name) + + for index, row_value in enumerate(column): + if get_op(operator, row_value, value): + indexes_to_del.append(index) + #print(indexes_to_del) + + for s in splt[1:]: + column_name, operator, value = self._parse_condition(s) + column = self.column_by_name(column_name) + for index, row_value in enumerate(column): + if get_op(operator, row_value, value): + indexes_to_del1.append(index) + #print(indexes_to_del1) + indexes_to_del = [c for c in indexes_to_del if c in indexes_to_del1] + #print(indexes_to_del) + if len(indexes_to_del) == 0: # no common element + break + else: # a simple delete query + column_name, operator, value = self._parse_condition(condition) + column = self.column_by_name(column_name) + for index, row_value in enumerate(column): + if get_op(operator, row_value, value): + indexes_to_del.append(index) # we pop from highest to lowest index in order to avoid removing the wrong item # since we dont delete, we dont have to to pop in that order, but since delete is used @@ -205,9 +348,104 @@ def _delete_where(self, condition): # self._update() # we have to return the deleted indexes, since they will be appended to the insert_stack return indexes_to_del + + ''' + def _delete_where_and_or(self, condition): + #t_indexes = [] + #operator1 = ' and ' + operator2 = ' or ' + splt = condition.split(operator2) + for s in splt: + self._delete_where(s) + #t_indexes.append(self._delete_where(s)) + #print(s) + #print(t_indexes) + #return t_indexes + ''' + def equivalence_transformation_rules(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None, flag = False): + ''' + Relational Algebraic Equivalence Transformation Rules: + 1. σθ1 ^ σθ2 = σθ1(σθ2) + 2. σθ1(σθ2) = σθ2(σθ1) + ''' + print("Equivalence Transformation Rule: σθ1^σθ2 = σθ1(σθ2)") + + # if * return all columns, else find the column indexes for the columns specified + if return_columns == '*': + return_cols = [i for i in range(len(self.column_names))] + else: + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + + + splt = condition.split(' and ') + if (len(splt)!=0): # if there are any conditions on the left and on the right side of or operator + rows = [] + rows1 = [] + + if (len(splt) == 2): + + k = random.randint(0, 1) # decide on k once + #print("random k is: ",k) + + if k == 0: #reverse + print("Equivalence Transformation Rule: σθ1(σθ2)=σθ2(σθ1)") + temp = '' + temp = splt[-1] + splt[-1] = splt[0] + splt[0] = temp - def _select_where(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None): + column_name, operator, value = self._parse_condition(splt[-1]) + column = self.column_by_name(column_name) + + for ind, x in enumerate(column): + if get_op(operator, x, value): + rows.append(ind) + + #print("Ιnitial rows are: ",rows) + for s in reversed(splt): + if s == splt[-1]: + continue + else: + column_name, operator, value = self._parse_condition(s) + column = self.column_by_name(column_name) + + for ind, x in enumerate(column): + if ind not in rows: # not in inner condition indexes + continue + else: + if get_op(operator, x, value): + rows1.append(ind) + + #print("Rows1 are: ",rows1) + rows = [c for c in rows if c in rows1] + if len(rows) == 0: # no common element + break + #print("Τotal rows are: ",rows) + + # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) + dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} + + # we need to set the new column names/types and no of columns, since we might + # only return some columns + dict['column_names'] = [self.column_names[i] for i in return_cols] + dict['column_types'] = [self.column_types[i] for i in return_cols] + + s_table = Table(load=dict) + s_table.data = list(set(map(lambda x: tuple(x), s_table.data))) if distinct else s_table.data + + if order_by: + s_table.order_by(order_by, desc) + if isinstance(limit,str): + s_table.data = [row for row in s_table.data if any(row)][:int(limit)] + + if (flag): + return s_table.data, dict + else: + return s_table + + + def _select_where(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None, flag = False): ''' Select and return a table containing specified columns and rows where condition is met. @@ -215,38 +453,54 @@ def _select_where(self, return_columns, condition=None, distinct=False, order_by return_columns: list. The columns to be returned. condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. - - Operatores supported: (<,<=,==,>=,>) + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or + + 'column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or column[<,<=,==,>=,>]value and/or... ' or + 'not column[<,<=,==,>=,>]value and/or not column[<,<=,==,>=,>]value and/or ...' or + + 'column between value1 and value2 or not column[<,<=,==,>=,>]value or ... ' or + 'column between value1 and value2 or column[<,<=,==,>=,>]value or ... ' or + 'column[<,<=,==,>=,>]value or column between value1 and value2 or ... ' or + 'not column[<,<=,==,>=,>]value or column between value1 and value2 or ... '. + + Operators supported: (<,<=,==,>=,>) distinct: boolean. If True, the resulting table will contain only unique rows (False by default). order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None). desc: boolean. If True, order_by will return results in descending order (False by default). limit: int. An integer that defines the number of rows that will be returned (all rows if None). ''' - + # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] else: return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] - + # if condition is None, return all rows # if not, return the rows with values where condition is met for value if condition is not None: + column_name, operator, value = self._parse_condition(condition) column = self.column_by_name(column_name) + rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] + else: rows = [i for i in range(len(self.data))] # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} - + + # we need to set the new column names/types and no of columns, since we might # only return some columns dict['column_names'] = [self.column_names[i] for i in return_cols] dict['column_types'] = [self.column_types[i] for i in return_cols] + #print(dict['column_names']) s_table = Table(load=dict) s_table.data = list(set(map(lambda x: tuple(x), s_table.data))) if distinct else s_table.data @@ -267,29 +521,52 @@ def _select_where(self, return_columns, condition=None, distinct=False, order_by if isinstance(limit,str): s_table.data = [row for row in s_table.data if any(row)][:int(limit)] - return s_table + #pd.eval(s_table) + #print(s_table.data) + if (flag): + return s_table.data, dict + else: + return s_table def _select_where_with_btree(self, return_columns, bt, condition, distinct=False, order_by=None, desc=True, limit=None): + print("Select where with btree.") # if * return all columns, else find the column indexes for the columns specified if return_columns == '*': return_cols = [i for i in range(len(self.column_names))] else: - return_cols = [self.column_names.index(colname) for colname in return_columns] + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + column_name, operator, value = self._parse_condition(condition) - - # if the column in condition is not a primary key, abort the select - if column_name != self.column_names[self.pk_idx]: - print('Column is not PK. Aborting') - + #print("self first",self.data) + #self.order_by(column_name, desc=True) + #print("self first1",self.data) + #self.order_by(column_name, desc=True) + #print("\nself after",self.data) + + #print("column name is: ",column_name) + #print("operator is: ",operator) + #print("value is: ",value) + + flag = False + for i in self.unique_cols_idx: + if column_name == self.column_names[i]: + flag = True + break + + # if the column in condition is not a primary key or unique, abort the select + if (flag is False and self.pk_idx and column_name != self.column_names[self.pk_idx]): + print('Column is not unique or PK. Aborting') + # here we run the same select twice, sequentially and using the btree. # we then check the results match and compare performance (number of operation) column = self.column_by_name(column_name) - + #print(column) # sequential + rows1 = [] opsseq = 0 for ind, x in enumerate(column): @@ -298,12 +575,87 @@ def _select_where_with_btree(self, return_columns, bt, condition, distinct=False rows1.append(ind) # btree find + #print("btree is: ",bt.show()) rows = bt.find(operator, value) + #print("rows1 are: ", rows1) + #print("rows from btree are: ", rows) + + try: + k = int(limit) + except TypeError: + k = None + + # same as simple select from now on + + rows = rows[:k] + # TODO: this needs to be dumbed down + dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} + + dict['column_names'] = [self.column_names[i] for i in return_cols] + dict['column_types'] = [self.column_types[i] for i in return_cols] + + s_table = Table(load=dict) + + s_table.data = list(set(map(lambda x: tuple(x), s_table.data))) if distinct else s_table.data + + if order_by: + s_table.order_by(order_by, desc) + + if isinstance(limit,str): + s_table.data = [row for row in s_table.data if row is not None][:int(limit)] + + return s_table + + + def _select_where_with_hash(self, return_columns, h, condition, distinct=False, order_by=None, desc=True, limit=None): + + print("Select where with hash.\n") + # if * return all columns, else find the column indexes for the columns specified + if return_columns == '*': + return_cols = [i for i in range(len(self.column_names))] + else: + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + + + column_name, operator, value = self._parse_condition(condition) + + flag = False + for i in self.unique_cols_idx: + if column_name == self.column_names[i]: + flag = True + break + + # if the column in condition is not a primary key or unique, abort the select + if (flag is False and self.pk_idx and column_name != self.column_names[self.pk_idx]): + print('Column is not unique or PK. Aborting') + + # here we run the same select twice, sequentially and using the hash. + # we then check the results match and compare performance (number of operation) + column = self.column_by_name(column_name) + #print(column) + # sequential + + rows1 = [] + opsseq = 0 + for ind, x in enumerate(column): + opsseq+=1 + if get_op(operator, x, value): + rows1.append(ind) + #print("rows1: ",rows1) + + # hash find + print("\nExtendible hashing:") + h.show() + print("\n") + rows = h.find(operator,value) + #print("rows: ",rows) + try: k = int(limit) except TypeError: k = None + # same as simple select from now on rows = rows[:k] # TODO: this needs to be dumbed down @@ -323,7 +675,9 @@ def _select_where_with_btree(self, return_columns, bt, condition, distinct=False s_table.data = [row for row in s_table.data if row is not None][:int(limit)] return s_table + + def order_by(self, column_name, desc=True): ''' Order table based on column. @@ -412,6 +766,7 @@ def _inner_join(self, table_right: Table, condition): return join_table + def _left_join(self, table_right: Table, condition): ''' Perform a left join on the table with the supplied table (right). @@ -442,6 +797,7 @@ def _left_join(self, table_right: Table, condition): return join_table + def _right_join(self, table_right: Table, condition): ''' Perform a right join on the table with the supplied table (right). @@ -472,6 +828,7 @@ def _right_join(self, table_right: Table, condition): return join_table + def _full_join(self, table_right: Table, condition): ''' Perform a full join on the table with the supplied table (right). @@ -513,6 +870,7 @@ def _full_join(self, table_right: Table, condition): return join_table + def show(self, no_of_rows=None, is_locked=False): ''' Print the table in a nice readable format. @@ -530,9 +888,26 @@ def show(self, no_of_rows=None, is_locked=False): # headers -> "column name (column type)" headers = [f'{col} ({tp.__name__})' for col, tp in zip(self.column_names, self.column_types)] - if self.pk_idx is not None: + #print(headers) + for c in range(len(self.column_names)): + if self.column_names[c] == self.pk: + headers[c] = headers[c]+' #PK#' + break + #if self.pk_idx is not None and self.pk in self.column_names: # table has a primary key, add PK next to the appropriate column - headers[self.pk_idx] = headers[self.pk_idx]+' #PK#' + #headers[self.pk_idx] = headers[self.pk_idx]+' #PK#' + + for c in range(len(self.column_names)): + if self.unique is not None and self.column_names[c] in self.unique: + headers[c] = headers[c]+' #UQ#' + ''' + if self.unique_cols_idx is not None: + # table has been declared as unique, add UQ next to the appropriate column + for c in self.unique_cols_idx: + print(self.unique_cols_idx) + print(self.unique) + headers[c] = headers[c]+' #UQ#' + ''' # detect the rows that are no tfull of nones (these rows have been deleted) # if we dont skip these rows, the returning table has empty rows at the deleted positions non_none_rows = [row for row in self.data if any(row)] @@ -547,7 +922,9 @@ def _parse_condition(self, condition, join=False): Args: condition: string. A condition using the following format: 'column[<,<=,==,>=,>]value' or - 'value[<,<=,==,>=,>]column'. + 'not column[<,<=,==,>=,>]value' or + 'value[<,<=,==,>=,>]column' or + 'column between value1 and value2' . Operatores supported: (<,<=,==,>=,>) join: boolean. Whether to join or not (False by default). @@ -557,11 +934,20 @@ def _parse_condition(self, condition, join=False): return split_condition(condition) # cast the value with the specified column's type and return the column name, the operator and the casted value - left, op, right = split_condition(condition) + + if (condition[:4] == 'not '): + left, op, right = split_not_condition(condition) + + else: + left, op, right = split_condition(condition) + if left not in self.column_names: raise ValueError(f'Condition is not valid (cant find column name)') coltype = self.column_types[self.column_names.index(left)] + if op == 'between': + #print("between has been found") + return left, op, right return left, op, coltype(right) @@ -577,3 +963,193 @@ def _load_from_file(self, filename): f.close() self.__dict__.update(tmp_dict.__dict__) + + + def _select_where_or(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None): + ''' + Select and return a table containing specified columns and rows where condition is met. + + Args: + return_columns: list. The columns to be returned. + condition: string. A condition using the following format: + 'column[<,<=,==,>=,>]value or column[<,<=,==,>=,>]value or... ' or + 'not column[<,<=,==,>=,>]value or column[<,<=,==,>=,>]value or... ' or + 'not column[<,<=,==,>=,>]value or not column[<,<=,==,>=,>]value or ... ' or + 'column between value1 and value2 or not column[<,<=,==,>=,>]value or ... ' or + 'column between value1 and value2 or column[<,<=,==,>=,>]value or ... ' or + 'column[<,<=,==,>=,>]value or column between value1 and value2 or ... ' or + 'not column[<,<=,==,>=,>]value or column between value1 and value2 or ... '. + + Operatores supported: (<,<=,==,>=,>) + distinct: boolean. If True, the resulting table will contain only unique rows (False by default). + order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None). + desc: boolean. If True, order_by will return results in descending order (False by default). + limit: int. An integer that defines the number of rows that will be returned (all rows if None). + ''' + + # if * return all columns, else find the column indexes for the columns specified + if return_columns == '*': + return_cols = [i for i in range(len(self.column_names))] + else: + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + + operator = ' or ' + splt = condition.split(operator) + if (len(splt)!=0): # if there are any conditions on the left and on the right side of or operator + rows = [] + for s in splt: + column_name, operator, value = self._parse_condition(s) + column = self.column_by_name(column_name) + + #rows.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + for ind, x in enumerate(column): + if get_op(operator, x, value) and ind not in rows: + rows.append(ind) + + #print("rows are: ",rows) + try: + k = int(limit) + except TypeError: + k = None + + rows = rows[:k] + + dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} + + dict['column_names'] = [self.column_names[i] for i in return_cols] + dict['column_types'] = [self.column_types[i] for i in return_cols] + + s_table = Table(load=dict) + s_table.data = list(set(map(lambda x: tuple(x), s_table.data))) if distinct else s_table.data + + if order_by: + s_table.order_by(order_by, desc) + + if isinstance(limit,str): + s_table.data = [row for row in s_table.data if row is not None][:int(limit)] + + return s_table + + + def _select_where_and(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None, flag = False): + ''' + Select and return a table containing specified columns and rows where condition is met. + + Args: + return_columns: list. The columns to be returned. + condition: string. A condition using the following format: + 'column[<,<=,==,>=,>]value and column[<,<=,==,>=,>]value and... ' or + 'not column[<,<=,==,>=,>]value and column[<,<=,==,>=,>]value and... ' or + 'not column[<,<=,==,>=,>]value and not column[<,<=,==,>=,>]value and ...' . + + Operatores supported: (<,<=,==,>=,>) + distinct: boolean. If True, the resulting table will contain only unique rows (False by default). + order_by: string. A column name that signals that the resulting table should be ordered based on it (no order if None). + desc: boolean. If True, order_by will return results in descending order (False by default). + limit: int. An integer that defines the number of rows that will be returned (all rows if None). + ''' + + # if * return all columns, else find the column indexes for the columns specified + if return_columns == '*': + return_cols = [i for i in range(len(self.column_names))] + else: + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + + + operator = ' and ' + splt = condition.split(operator) + + if (len(splt)!=0): # if there are any conditions on the left and on the right side of and operator + + column_name, operator, value = self._parse_condition(splt[0]) + column = self.column_by_name(column_name) + + rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] + + for cond in splt[1:]: + column_name, operator, value = self._parse_condition(cond) + column = self.column_by_name(column_name) + rows1 = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] + + rows = [c for c in rows if c in rows1] + if len(rows) == 0: # no common element + break + + # copy the old dict, but only the rows and columns of data with index in rows/columns (the indexes that we want returned) + dict = {(key):([[self.data[i][j] for j in return_cols] for i in rows] if key=="data" else value) for key,value in self.__dict__.items()} + + # we need to set the new column names/types and no of columns, since we might + # only return some columns + dict['column_names'] = [self.column_names[i] for i in return_cols] + dict['column_types'] = [self.column_types[i] for i in return_cols] + + s_table = Table(load=dict) + s_table.data = list(set(map(lambda x: tuple(x), s_table.data))) if distinct else s_table.data + + if order_by: + s_table.order_by(order_by, desc) + if isinstance(limit,str): + s_table.data = [row for row in s_table.data if any(row)][:int(limit)] + + if (flag): + return s_table.data, dict + else: + return s_table + + + def _select_where_and_or(self, return_columns, condition=None, distinct=False, order_by=None, desc=True, limit=None): + + # if * return all columns, else find the column indexes for the columns specified + ''' + if return_columns == '*': + return_cols = [i for i in range(len(self.column_names))] + else: + return_cols = [self.column_names.index(col.strip()) for col in return_columns.split(',')] + ''' + + data = [] + operator1 = ' and ' + operator2 = ' or ' + + splt = condition.split(operator2) + #print(splt) + + # dict -> in order to get the new column names, since we might only return some columns + if (operator1 in splt[0]): # and in condition -> call it's method + dict = self._select_where_and(return_columns, splt[0], distinct, order_by, desc, limit, True)[1] + else: + dict = self._select_where(return_columns, splt[0], distinct, order_by, desc, limit, True)[1] + + for cond in splt: + if (operator1 in cond): # and in condition + data.append(self._select_where_and(return_columns, cond, distinct, order_by, desc, limit, True)[0]) + else: + data.append(self._select_where(return_columns, cond, distinct, order_by, desc, limit, True)[0]) + + self = Table(load=dict) + #print(data) + data1 = [elem for twod in data for elem in twod] # convert 3D list into a 2D list + + # remove duplicate records but first sort the list + data1.sort() + new_list = list(l for l, _ in itertools.groupby(data1)) + + self.data = new_list # final data + return self + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/sql_files/smallRelationsInsertFile.sql b/sql_files/smallRelationsInsertFile.sql index d05d81b9..34e45393 100644 --- a/sql_files/smallRelationsInsertFile.sql +++ b/sql_files/smallRelationsInsertFile.sql @@ -1,10 +1,10 @@ -create table classroom (building str, room_number str, capacity int); +create table classroom (building str unique, room_number str unique, capacity int); create table department (dept_name str primary key, building str, budget int); -create table course (course_id str primary key, title str, dept_name str, credits int); -create table instructor (ID str primary key, name str, dept_name str, salary int); -create table section (course_id str, sec_id str, semester str, year int, building str, room_number str, time_slot_id str); +create table course (course_id str primary key, title str unique, dept_name str unique, credits int); +create table instructor (ID str primary key, name str unique, dept_name str, salary int); +create table section (course_id str, sec_id str, semester str unique, year int, building str, room_number str, time_slot_id str); create table teaches (ID str, course_id str, sec_id str, semester str, year int); -create table student (ID str primary key, name str, dept_name str, tot_cred int); +create table student (ID str primary key, name str, dept_name str unique, tot_cred int); create table takes (ID str, course_id str, sec_id str, semester str, year int, grade str); create table advisor (s_ID str primary key, i_ID str); create table time_slot (time_slot_id str, day str, start_hr int, start_min int, end_hr str, end_min str);