diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 000000000..8b3c1a01b --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,6 @@ +[theme] +primaryColor="#00A86B" +backgroundColor="#0A0A0A" +secondaryBackgroundColor="#1A1A1A" +textColor="#FFFFFF" +font="sans serif" \ No newline at end of file diff --git a/dashboard.py/dashboard_app.py b/dashboard.py/dashboard_app.py new file mode 100644 index 000000000..6f44c7062 --- /dev/null +++ b/dashboard.py/dashboard_app.py @@ -0,0 +1,320 @@ +import sys +from pathlib import Path +import plotly.express as px + +sys.path.append(str(Path(__file__).resolve().parents[1])) + +import streamlit as st +from www.services.etl.pipeline import openalex_pipeline +import pandas as pd + +st.set_page_config( + page_title="Bibliometrix Dashboard", + layout="wide" +) + +st.markdown(""" + +""", unsafe_allow_html=True) + +st.title("๐Ÿ“š Bibliometrix Dashboard") +st.write("OpenAlex ETL + Bibliometric Analysis") + +colA, colB = st.columns([3, 1]) + +with colA: + query = st.text_input( + "", + placeholder="๐Ÿ” Search topics like AI, Machine Learning, Data Science..." + ) + +with colB: + max_results = st.selectbox( + "Documents", + [50, 100, 200, 500], + index=1 + ) + +if not query: + query = "machine learning" + +df = openalex_pipeline( + query=query, + max_results=max_results +) +col1, col2, col3, col4 = st.columns(4) + +with col1: + st.metric("Documents", len(df)) + +with col2: + st.metric( + "Authors", + df["AU"].explode().dropna().nunique() + ) + +with col3: + st.metric( + "Keywords", + df["DE"].explode().dropna().nunique() + ) + +with col4: + st.metric( + "Total Citations", + int(df["TC"].fillna(0).sum()) + ) +# ================================================== +# DATASET PREVIEW +# ================================================== + +st.markdown(""" +

+๐Ÿ“„ STANDARDIZED DATASET PREVIEW +

+""", unsafe_allow_html=True) + +st.dataframe(df, use_container_width=True) + +# ================================================== +# PUBLICATIONS BY YEAR +# ================================================== + +st.divider() + +st.markdown(""" +

+๐Ÿ“Š PUBLICATIONS BY YEAR +

+""", unsafe_allow_html=True) + +year_counts = df["PY"].value_counts().sort_index() + +fig = px.bar( + x=year_counts.index, + y=year_counts.values +) + +fig.update_traces( + marker_color="#D4AF37" +) + +fig.update_layout( + paper_bgcolor="#0A0A0A", + plot_bgcolor="#0A0A0A", + font_color="white", + xaxis_title="Year", + yaxis_title="Publications", + yaxis=dict(dtick=1), + showlegend=False +) + +st.plotly_chart(fig, use_container_width=True) + +# ================================================== +# TOP CITED PAPERS +# ================================================== + +st.divider() + +st.markdown(""" +

+๐Ÿ† TOP 10 MOST CITED PAPERS +

+""", unsafe_allow_html=True) + +if "TC" in df.columns: + + top_papers = ( + df.sort_values("TC", ascending=False) + [["TI", "TC"]] + .head(10) + ) + + st.dataframe(top_papers, use_container_width=True) + +# ================================================== +# TOP AUTHORS +# ================================================== + +st.divider() + +st.markdown(""" +

+๐Ÿ‘ฅ TOP AUTHORS +

+""", unsafe_allow_html=True) + +top_authors = ( + df["AU"] + .explode() + .dropna() + .value_counts() + .head(10) + .reset_index() +) + +top_authors.columns = ["Author", "Publications"] + +# Ranking Column +top_authors.insert( + 0, + "Rank", + ["๐Ÿฅ‡", "๐Ÿฅˆ", "๐Ÿฅ‰", "4", "5", "6", "7", "8", "9", "10"] +) + +col1, col2 = st.columns([3, 1]) + +with col1: + st.dataframe( + top_authors, + use_container_width=True, + hide_index=True + ) + +with col2: + st.metric( + "TOP AUTHOR", + top_authors.iloc[0]["Author"] + ) + + st.metric( + "PUBLICATIONS", + int(top_authors.iloc[0]["Publications"]) + ) +# ================================================== +# TOP KEYWORDS +# ================================================== + +st.divider() + +st.markdown(""" +

+๐Ÿ”‘ TOP KEYWORDS +

+""", unsafe_allow_html=True) + +top_keywords = ( + df["DE"] + .explode() + .dropna() + .value_counts() + .head(10) + .reset_index() +) + +top_keywords.columns = ["Keyword", "Frequency"] + +fig = px.bar( + top_keywords.sort_values("Frequency"), + x="Frequency", + y="Keyword", + orientation="h", + text="Frequency" +) + +fig.update_traces( + marker_color="#D4AF37", + textposition="outside" +) + +fig.update_layout( + paper_bgcolor="#0A0A0A", + plot_bgcolor="#0A0A0A", + font_color="white", + xaxis_title="Frequency", + yaxis_title="", + showlegend=False, + height=600, + margin=dict(l=20, r=20, t=20, b=20) +) + +fig.update_xaxes( + nticks=8 +) + + +st.plotly_chart(fig, use_container_width=True) +st.divider() + +st.markdown(""" +

+โฌ‡ EXPORT RESULTS +

+""", unsafe_allow_html=True) + +csv = df.to_csv(index=False) + +st.download_button( + label="๐Ÿ“ฅ Download Dataset (CSV)", + data=csv, + file_name=f"{query}_bibliometric_data.csv", + mime="text/csv" +) + +st.divider() + +st.markdown(""" +
+ +

+๐Ÿ“š Bibliometrix Dashboard +

+ +Developed by
+Madhumithra Balasubramanian
+Aya Soundous Hechaichi
+Alina Siddiqui + +
+ +Technologies Used
+Python โ€ข Streamlit โ€ข OpenAlex API โ€ข Bibliometrix Framework + +
+ +Hardware and Software for Big Data โ€“ Mod B
+University of Naples Federico II + +
+ +Professor: Vincenzo Moscato
Data Science Course โ€“ Academic Year 2025/2026 +
+""", unsafe_allow_html=True) \ No newline at end of file diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index 512ed7489..f89394677 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -46,7 +46,6 @@ def get_collaboration_network( print("Generating collaboration network...") M = df - m = df.get() NetRefs = None Title = "" diff --git a/www/services/biblionetwork.py b/www/services/biblionetwork.py index 7e65b4880..4825cb4c7 100644 --- a/www/services/biblionetwork.py +++ b/www/services/biblionetwork.py @@ -71,8 +71,8 @@ def crossprod(A, B): filtered_index = [idx for idx in NetMatrix.index if str(idx).strip()] NetMatrix = NetMatrix.loc[filtered_index, filtered_columns] - M = M.get() # Estrai il dizionario se M รจ un oggetto - + # M is already a DataFrame + db_name = M["DB"].iloc[0] print(f"db_name: {db_name}") if network == "references" and db_name == "SCOPUS": diff --git a/www/services/cocmatrix.py b/www/services/cocmatrix.py index f523aed67..c53b5a972 100644 --- a/www/services/cocmatrix.py +++ b/www/services/cocmatrix.py @@ -1,64 +1,99 @@ from .utils import * -def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short=False, remove_terms=None, synonyms=None): +def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, + short=False, remove_terms=None, synonyms=None): + """ Computes occurrences between elements of a Tag Field from a bibliographic data frame. - - Args: - M: A DataFrame obtained by the converting function. It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file. - Field: A string indicating one of the field tags of the standard ISI WoS Field Tag codify. - type: Indicates the output format of co-occurrences ("matrix" or "sparse"). - n: An integer indicating the number of items to select. If None, all items are selected. - sep: The field separator character. - binary: A boolean. If True each cell contains a 0/1. If False each cell contains the frequency. - short: A boolean. If True all items with frequency < 2 are deleted to reduce the matrix size. - remove_terms: A list of additional terms to delete from the documents before term extraction. - synonyms: A list of synonyms that will be merged into a single term. - - Returns: - A bipartite network matrix with cases corresponding to manuscripts and variables to the objects extracted from the Tag Field. """ - M = df.get() + + # Support both wrapped objects and pandas DataFrames + if hasattr(df, "get") and not isinstance(df, pd.DataFrame): + M = df + else: + M = df if "LABEL" not in M.columns: M.index = M["SR"] print("Processing field: " + Field + "\n") + RowNames = M.index # REMOVE TERMS AND MERGE SYNONYMS if Field in ["ID", "DE", "TI", "TI_TM", "AB", "AB_TM"]: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) - TERMS = pd.DataFrame({"item": [item.upper() for sublist in Fi for item in sublist], "SR": M.index.repeat(Fi.str.len())}) + + Fi = M[Field].fillna("").apply( + lambda x: x if isinstance(x, list) + else [i.strip() for i in str(x).split(sep)] + ) + + TERMS = pd.DataFrame({ + "item": [item.upper() for sublist in Fi for item in sublist], + "SR": M.index.repeat(Fi.str.len()) + }) # Merge synonyms if synonyms: - synonyms_dict = {syn.split(";")[0].strip().upper(): [s.strip().upper() for s in syn.split(";")[1:]] for syn in synonyms} + synonyms_dict = { + syn.split(";")[0].strip().upper(): + [s.strip().upper() for s in syn.split(";")[1:]] + for syn in synonyms + } + for key, values in synonyms_dict.items(): TERMS["item"] = TERMS["item"].replace(values, key) # Remove terms if remove_terms: - TERMS = TERMS[~TERMS["item"].str.upper().isin([term.strip().upper() for term in remove_terms])] + TERMS = TERMS[ + ~TERMS["item"].str.upper().isin( + [term.strip().upper() for term in remove_terms] + ) + ] + + TERMS = TERMS.groupby("SR")["item"].apply( + lambda x: ";".join(x) + ).reset_index() + + M = ( + M.drop(columns=[Field, "SR"]) + .merge(TERMS, on="SR", how="left") + .rename(columns={"item": Field}) + ) - TERMS = TERMS.groupby("SR")["item"].apply(lambda x: ";".join(x)).reset_index() - M = M.drop(columns=[Field, 'SR']).merge(TERMS, on="SR", how="left").rename(columns={"item": Field}) M.index = RowNames if Field == "CR": - M["CR"] = M["CR"].apply(lambda x: [ref.replace("DOI;", "DOI ") for ref in x] if isinstance(x, list) else x) + M["CR"] = M["CR"].apply( + lambda x: [ref.replace("DOI;", "DOI ") for ref in x] + if isinstance(x, list) + else x + ) if Field in M.columns: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) + Fi = M[Field].fillna("").apply( + lambda x: x if isinstance(x, list) + else [i.strip() for i in str(x).split(sep)] + ) else: print(f"Field {Field} is not a column name of input data frame") - return + return None + + Fi = Fi.apply(lambda x: [i.strip() for i in x]) - Fi = Fi.apply(lambda x: [i.strip() for i in x]) # Equivalent to trim.leading in R if Field == "CR": - Fi = Fi.apply(lambda x: [i for i in x if len(i) > 10]) # Delete not congruent references + Fi = Fi.apply( + lambda x: [i for i in x if len(i) > 10] + ) + + allField = [ + item + for sublist in Fi + for item in sublist + if item + ] - allField = [item for sublist in Fi for item in sublist if item] if Field == "CR": allField = reduceRefs(allField) Fi = Fi.apply(reduceRefs) @@ -81,30 +116,52 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short WF = lil_matrix((M.shape[0], len(uniqueField))) else: print("Error in type argument") - return + return None + + col_idx = { + term: idx + for idx, term in enumerate(uniqueField) + } - col_idx = {term: idx for idx, term in enumerate(uniqueField)} - row_idx = {sr: idx for idx, sr in enumerate(M.index)} + row_idx = { + sr: idx + for idx, sr in enumerate(M.index) + } for i, terms in Fi.items(): + if terms: + if binary: - indices = [col_idx[term] for term in set(terms) if term in col_idx] + + indices = [ + col_idx[term] + for term in set(terms) + if term in col_idx + ] + WF[row_idx[i], indices] = 1 + else: + term_counts = pd.Series(terms).value_counts() + for term, count in term_counts.items(): + if term in col_idx: WF[row_idx[i], col_idx[term]] = count if type == "sparse" and not binary: WF = lil_matrix(WF) - # Convert the sparse matrix to a DataFrame for better readability - WF_df = pd.DataFrame(WF.toarray(), index=M.index, columns=uniqueField) + WF_df = pd.DataFrame( + WF.toarray(), + index=M.index, + columns=uniqueField + ) + if binary: - WF_df = WF_df.astype(int) # Ensure binary values are 0 and 1 - # print(WF_df) + WF_df = WF_df.astype(int) return WF_df diff --git a/www/services/couplingmap.py b/www/services/couplingmap.py index a2b3628d7..3819207cd 100644 --- a/www/services/couplingmap.py +++ b/www/services/couplingmap.py @@ -6,7 +6,7 @@ from .histnetwork import * from .metatagextraction import * from .tabletag import * - +import pandas as pd def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, ngrams=1, community_repulsion=0.1, impact_measure="local", stemming=False, size=0.5, label_term=None, n_labels=1, repel=True, clustering="walktrap"): @@ -15,8 +15,8 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, print('\nanalysis argument is incorrect.\n\nPlease select one of the following choices: "documents", "authors", "sources"\n\n') return None - df = metaTagExtraction(df, "SR") # serve questo per avere il merging perfetto per uniformare la colonna SR - M = df.get() + df = metaTagExtraction(df, "SR") + M = df ngrams = int(ngrams) minfreq = max(0, int(minfreq * len(M) // 1000)) @@ -62,13 +62,24 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, C = L.merge(Net['cluster_res'], on=analysis, how='left', copy=True) # Get group membership and colors - group = Net['cluster_obj'].membership - color = net.vs['color'] - - # Convert colors to hex and handle NaN values - color = [to_hex(c) if pd.notna(c) else "#D3D3D3" for c in color] - # color[pd.isna(color)] = "#B3B3B3" # Colore grigio chiaro in formato RGBA + group = list(Net['cluster_obj'].membership) + color = list(net.vs['color']) + + # Convert colors + color = [ + to_hex(c) if pd.notna(c) else "#D3D3D3" + for c in color + ] + # Make lengths match + min_len = min(len(D), len(group), len(color)) + + D = D.iloc[:min_len].copy() + + group = group[:min_len] + color = color[:min_len] + + D['group'] = group D['color'] = color @@ -308,68 +319,95 @@ def limit_to_first(text): #### FUNCTION DA METTERE IN SERVICES??? # Normalizzazione del punteggio di citazione def normalizeCitationScore(df, field="documents", impact_measure="local"): + if field not in ["documents", "authors", "sources"]: - print('\nfield argument is incorrect.\n\nPlease select one of the following choices: "documents", "authors", "sources"\n\n') + print( + '\nfield argument is incorrect.\n\n' + 'Please select one of the following choices: ' + '"documents", "authors", "sources"\n\n' + ) return None - # Applica localCitations se richiesto + # OpenAlex workaround if impact_measure == "local": - df = localCitations(df, fast_search=False, sep=";")['M'] + + if df["DB"].iloc[0] == "OPENALEX": + df["LCS"] = 1 + + else: + df = localCitations( + df, + fast_search=False, + sep=";" + )["M"] + else: - df['LCS'] = 0 + df["LCS"] = 0 + + df["TC"] = pd.to_numeric(df["TC"], errors="coerce") + df["PY"] = pd.to_numeric(df["PY"], errors="coerce") - # Converte colonne in numerico - df['TC'] = df['TC'].astype(float, errors='ignore') - df['PY'] = df['PY'].astype(float, errors='ignore') + df["LCS"] = df["LCS"].replace(0, 1) - # Rimpiazza LCS=0 con 1 e calcola NGCS/NLCS per anno - df['LCS'] = df['LCS'].replace(0, 1) - df['NGCS'] = df.groupby('PY')['TC'].transform(lambda x: x / x.mean(skipna=True)) - df['NLCS'] = df.groupby('PY')['LCS'].transform(lambda x: x / x.mean(skipna=True)) + df["NGCS"] = df.groupby("PY")["TC"].transform( + lambda x: x / x.mean() + ) + + df["NLCS"] = df.groupby("PY")["LCS"].transform( + lambda x: x / x.mean() + ) - # Suddivisione per tipo di campo richiesto if field == "documents": - NCS = df[['SR', 'PY', 'NGCS', 'NLCS', 'TC', 'LCS']].rename(columns={ - 'NGCS': 'MNGCS', - 'NLCS': 'MNLCS', - 'LCS': 'LC', - 'SR': 'documents' - }) + + NCS = df[ + ["SR", "PY", "NGCS", "NLCS", "TC", "LCS"] + ].rename( + columns={ + "SR": "documents", + "NGCS": "MNGCS", + "NLCS": "MNLCS", + "LCS": "LC" + } + ) elif field == "authors": - df['AU'] = df['AU'].fillna('').str.split(';') # Divide gli autori - exploded = df.explode('AU').assign(AU=lambda x: x['AU'].str.strip()) # Espande e rimuove spazi extra + + exploded = ( + df.assign(AU=df["AU"].str.split(";")) + .explode("AU") + ) NCS = ( - exploded.groupby('AU').agg( - NP=('PY', 'count'), - MNGCS=('NGCS', 'mean'), - MNLCS=('NLCS', 'mean'), - TC=('TC', 'mean'), - LC=('LCS', 'mean') + exploded.groupby("AU") + .agg( + NP=("PY", "count"), + MNGCS=("NGCS", "mean"), + MNLCS=("NLCS", "mean"), + TC=("TC", "mean"), + LC=("LCS", "mean") ) .reset_index() - .rename(columns={'AU': 'authors'}) + .rename(columns={"AU": "authors"}) ) - elif field == "sources": + else: + NCS = ( - df.groupby('SO').agg( - NP=('PY', 'count'), - MNGCS=('NGCS', 'mean'), - MNLCS=('NLCS', 'mean'), - TC=('TC', 'mean'), - LC=('LCS', 'mean') + df.groupby("SO") + .agg( + NP=("PY", "count"), + MNGCS=("NGCS", "mean"), + MNLCS=("NLCS", "mean"), + TC=("TC", "mean"), + LC=("LCS", "mean") ) .reset_index() - .rename(columns={'SO': 'sources'}) + .rename(columns={"SO": "sources"}) ) - # Gestione impatto globale if impact_measure == "global": - NCS.drop(columns=['MNLCS', 'LC'], errors='ignore', inplace=True) - else: - NCS['MNLCS'] = NCS['MNLCS'].fillna(0) + NCS["MNLCS"] = NCS["MNGCS"] + NCS["LC"] = NCS["TC"] return NCS @@ -517,7 +555,7 @@ def best_lab(df, tab_global, n_labels, term): def localCitations(df, fast_search=False, sep=";"): df = metaTagExtraction(df, "SR") - M = df.get() + M = df M['TC'] = M['TC'].fillna(0) if fast_search: loccit = M['TC'].quantile(0.75) diff --git a/www/services/etl/__init__.py b/www/services/etl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/www/services/etl/api_retriever.py b/www/services/etl/api_retriever.py new file mode 100644 index 000000000..e0f975e52 --- /dev/null +++ b/www/services/etl/api_retriever.py @@ -0,0 +1,116 @@ +import requests +import time + + +OPENALEX_URL = "https://api.openalex.org/works" + + +def fetch_openalex(query, max_results=100): + + results = [] + per_page = 25 + page = 1 + retries = 3 + + while len(results) < max_results: + + params = { + "search": query, + "per-page": per_page, + "page": page + } + + success = False + + for attempt in range(retries): + + try: + + response = requests.get( + OPENALEX_URL, + params=params, + timeout=30 + ) + + response.raise_for_status() + + data = response.json() + + works = data.get("results", []) + + if not works: + return results[:max_results] + + results.extend(works) + + success = True + + break + + except requests.exceptions.RequestException: + + time.sleep(2) + + if not success: + break + + page += 1 + + time.sleep(1) + + return results[:max_results] + + +def openalex_to_records(results): + + records = [] + + for work in results: + + authors = [ + a.get("author", {}).get("display_name", "") + for a in work.get("authorships", []) + ] + + affiliations = [] + + for a in work.get("authorships", []): + for inst in a.get("institutions", []): + name = inst.get("display_name", "") + if name: + affiliations.append(name) + + keywords = [ + k.get("display_name", "") + for k in work.get("keywords", []) + ] + + concepts = [ + c.get("display_name", "") + for c in work.get("concepts", []) + ] + + record = { + "DB": "OPENALEX", + "UT": work.get("id", ""), + "DI": work.get("doi", ""), + "TI": work.get("title", ""), + "PY": work.get("publication_year", ""), + "AB": str(work.get("abstract_inverted_index", {})), + "TC": work.get("cited_by_count", 0), + "CR": work.get("referenced_works", []), + "SO": work.get("display_name", ""), + "SR": work.get("display_name", ""), + "AU": authors, + "AF": authors, + "DE": keywords, + "ID": concepts, + "C1": affiliations, + "DT": work.get("type", ""), + "LA": work.get("language", ""), + "RP": authors[0] if authors else "" + } + + records.append(record) + + return records \ No newline at end of file diff --git a/www/services/etl/dispatcher.py b/www/services/etl/dispatcher.py new file mode 100644 index 000000000..e90af05b1 --- /dev/null +++ b/www/services/etl/dispatcher.py @@ -0,0 +1,29 @@ +""" +Dispatcher for selecting mappings +based on source database. +""" + +from .mappings import ( + SCOPUS_MAPPING, + DIMENSIONS_MAPPING, + PUBMED_MAPPING, + OPENALEX_MAPPING +) + + +def get_mapping(source: str): + source = source.upper() + + if source == "SCOPUS": + return SCOPUS_MAPPING + + if source == "DIMENSIONS": + return DIMENSIONS_MAPPING + + if source == "PUBMED": + return PUBMED_MAPPING + + if source == "OPENALEX": + return OPENALEX_MAPPING + + raise ValueError(f"Unsupported source: {source}") \ No newline at end of file diff --git a/www/services/etl/mappings.py b/www/services/etl/mappings.py new file mode 100644 index 000000000..ae01cfc1e --- /dev/null +++ b/www/services/etl/mappings.py @@ -0,0 +1,52 @@ +SCOPUS_MAPPING = { + "Authors": "AU", + "Author full names": "AF", + "Title": "TI", + "Source title": "SO", + "Year": "PY", + "DOI": "DI", + "Abstract": "AB", + "Volume": "VL", + "Issue": "IS", + "Page start": "BP", + "Page end": "EP", + "Author Keywords": "DE", + "Index Keywords": "ID", + "Language of Original Document": "LA", + "Document Type": "DT", + "Affiliations": "C1", + "References": "CR", + "PubMed ID": "PMID" +} + +DIMENSIONS_MAPPING = { + "Authors": "AU", + "Title": "TI", + "Source title": "SO", + "Year": "PY", + "DOI": "DI", + "Abstract": "AB" +} + +PUBMED_MAPPING = { + "PMID": "PMID", + "Title": "TI", + "Abstract": "AB", + "Authors": "AU", + "Journal": "SO", + "Year": "PY", + "Keywords": "DE" +} + +OPENALEX_MAPPING = { + "id": "UT", + "doi": "DI", + "title": "TI", + "publication_year": "PY", + "abstract": "AB", + "display_name": "SO", + "referenced_works": "CR", + "authorships": "AU", + "institutions": "C1", + "cited_by_count": "TC" +} \ No newline at end of file diff --git a/www/services/etl/pipeline.py b/www/services/etl/pipeline.py new file mode 100644 index 000000000..15103d89a --- /dev/null +++ b/www/services/etl/pipeline.py @@ -0,0 +1,30 @@ +import pandas as pd + +from .api_retriever import ( + fetch_openalex, + openalex_to_records +) + +from .transformer import transform_dataframe +from .validator import validate_dataframe + + +def openalex_pipeline(query, max_results=100): + + results = fetch_openalex( + query=query, + max_results=max_results + ) + + records = openalex_to_records(results) + + df = pd.DataFrame(records) + + df = transform_dataframe( + df, + source="openalex" + ) + + validate_dataframe(df) + + return df \ No newline at end of file diff --git a/www/services/etl/schema.py b/www/services/etl/schema.py new file mode 100644 index 000000000..a585cebc2 --- /dev/null +++ b/www/services/etl/schema.py @@ -0,0 +1,35 @@ +MANDATORY_COLUMNS = [ + "AU", + "TI", + "SO", + "PY" +] + +MULTI_VALUE_COLUMNS = [ + "AU", + "AF", + "C1", + "CR", + "DE", + "ID" +] + +REQUIRED_COLUMNS = [ + "SR", + "AU", + "AF", + "TI", + "SO", + "PY", + "DI", + "DE", + "ID", + "CR", + "C1", + "AB", + "TC", + "DT", + "LA", + "DB", + "RP" +] \ No newline at end of file diff --git a/www/services/etl/test_biblionetwork.py b/www/services/etl/test_biblionetwork.py new file mode 100644 index 000000000..6b38e2e8a --- /dev/null +++ b/www/services/etl/test_biblionetwork.py @@ -0,0 +1,17 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.biblionetwork import biblionetwork + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +net = biblionetwork( + df, + analysis="co-occurrences", + network="keywords" +) + +print(type(net)) +print(net.shape) +print(net.head()) diff --git a/www/services/etl/test_cocitation.py b/www/services/etl/test_cocitation.py new file mode 100644 index 000000000..1b25e55bc --- /dev/null +++ b/www/services/etl/test_cocitation.py @@ -0,0 +1,17 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.biblionetwork import biblionetwork + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +net = biblionetwork( + df, + analysis="co-citation", + network="references" +) + +print(type(net)) +print(net.shape) +print(net.head()) \ No newline at end of file diff --git a/www/services/etl/test_collaboration.py b/www/services/etl/test_collaboration.py new file mode 100644 index 000000000..51bf18c42 --- /dev/null +++ b/www/services/etl/test_collaboration.py @@ -0,0 +1,17 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.biblionetwork import biblionetwork + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +net = biblionetwork( + df, + analysis="collaboration", + network="authors" +) + +print(type(net)) +print(net.shape) +print(net.head()) \ No newline at end of file diff --git a/www/services/etl/test_coupling_network.py b/www/services/etl/test_coupling_network.py new file mode 100644 index 000000000..7830db40e --- /dev/null +++ b/www/services/etl/test_coupling_network.py @@ -0,0 +1,17 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.biblionetwork import biblionetwork + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +net = biblionetwork( + df, + analysis="coupling", + network="authors" +) + +print(type(net)) +print(net.shape) +print(net.head()) \ No newline at end of file diff --git a/www/services/etl/test_couplingmap.py b/www/services/etl/test_couplingmap.py new file mode 100644 index 000000000..8e1eeb15d --- /dev/null +++ b/www/services/etl/test_couplingmap.py @@ -0,0 +1,17 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.couplingmap import couplingMap + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +result = couplingMap( + df, + analysis="documents", + field="CR" +) + +print(result["clusters"].head()) +print(result["data"].head()) +print(result["nclust"]) \ No newline at end of file diff --git a/www/services/etl/test_histnetwork.py b/www/services/etl/test_histnetwork.py new file mode 100644 index 000000000..384308b5f --- /dev/null +++ b/www/services/etl/test_histnetwork.py @@ -0,0 +1,13 @@ +# test_histnetwork.py + +from www.services.etl.pipeline import openalex_pipeline +from www.services.histnetwork import histNetwork +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +result = histNetwork(df) + +print(type(result)) +print(result.keys()) \ No newline at end of file diff --git a/www/services/etl/test_pipeline.py b/www/services/etl/test_pipeline.py new file mode 100644 index 000000000..6846f4cdd --- /dev/null +++ b/www/services/etl/test_pipeline.py @@ -0,0 +1,14 @@ +from www.services.etl.pipeline import openalex_pipeline + +df = openalex_pipeline( + query="machine learning", + max_results=5 +) + +print(df.columns.tolist()) + +print("\nID column:") +print(df["ID"].head()) + +print("\nDE column:") +print(df["DE"].head()) \ No newline at end of file diff --git a/www/services/etl/test_thematicmap.py b/www/services/etl/test_thematicmap.py new file mode 100644 index 000000000..3648d6b93 --- /dev/null +++ b/www/services/etl/test_thematicmap.py @@ -0,0 +1,24 @@ +from www.services.etl.pipeline import openalex_pipeline +from www.services.thematicmap import thematic_map + +df = openalex_pipeline( + query="machine learning", + max_results=50 +) + +result = thematic_map( + df, + field="ID" +) + +print(type(result)) +print(len(result)) + +for i, item in enumerate(result): + print(f"\n===== RESULT[{i}] =====") + print(type(item)) + + try: + print(item.head()) + except: + print(item) \ No newline at end of file diff --git a/www/services/etl/transformer.py b/www/services/etl/transformer.py new file mode 100644 index 000000000..461e183bb --- /dev/null +++ b/www/services/etl/transformer.py @@ -0,0 +1,87 @@ +import pandas as pd +from .dispatcher import get_mapping +from .schema import ( + MULTI_VALUE_COLUMNS, + REQUIRED_COLUMNS +) + +def ensure_required_columns(df): + + for col in REQUIRED_COLUMNS: + if col not in df.columns: + df[col] = "" + + return df + +def split_multi_value(value): + + if isinstance(value, list): + return value + + if value is None: + return [] + + try: + if pd.isna(value): + return [] + except Exception: + pass + + if str(value).strip() == "": + return [] + + return [ + item.strip() + for item in str(value).replace(",", ";").split(";") + if item.strip() + ] + + +def standardize_nulls(df): + + return df.fillna("") + + +def rename_columns(df, source): + + mapping = get_mapping(source) + + existing_mapping = { + k: v + for k, v in mapping.items() + if k in df.columns + } + + return df.rename(columns=existing_mapping) + + +def standardize_multivalue_columns(df): + + for col in MULTI_VALUE_COLUMNS: + + if col not in df.columns: + continue + + df[col] = df[col].apply(split_multi_value) + + return df + +def ensure_required_columns(df): + + for col in REQUIRED_COLUMNS: + if col not in df.columns: + df[col] = "" + + return df + +def transform_dataframe(df, source): + + df = rename_columns(df, source) + + df = standardize_nulls(df) + + df = ensure_required_columns(df) + + df = standardize_multivalue_columns(df) + + return df \ No newline at end of file diff --git a/www/services/etl/validator.py b/www/services/etl/validator.py new file mode 100644 index 000000000..969e71cd7 --- /dev/null +++ b/www/services/etl/validator.py @@ -0,0 +1,68 @@ +from .schema import ( + MANDATORY_COLUMNS, + MULTI_VALUE_COLUMNS +) + + +def validate_columns(df): + """ + Ensure all required columns exist. + """ + + missing = [ + col + for col in MANDATORY_COLUMNS + if col not in df.columns + ] + + if missing: + raise ValueError( + f"Missing mandatory columns: {missing}" + ) + + return True + + +def validate_nulls(df): + """ + Ensure no null values remain. + """ + + if df.isnull().values.any(): + raise ValueError( + "Dataset contains null values." + ) + + return True + + +def validate_multivalue_types(df): + """ + Ensure multivalue columns contain lists. + """ + + for col in MULTI_VALUE_COLUMNS: + + if col not in df.columns: + continue + + for value in df[col]: + + if not isinstance(value, list): + raise TypeError( + f"{col} contains non-list value." + ) + + return True + + +def validate_dataframe(df): + """ + Full validation pipeline. + """ + + validate_columns(df) + validate_nulls(df) + validate_multivalue_types(df) + + return True \ No newline at end of file diff --git a/www/services/format_functions.py b/www/services/format_functions.py index 1a8ee7af4..9cac97870 100644 --- a/www/services/format_functions.py +++ b/www/services/format_functions.py @@ -3,7 +3,9 @@ import zipfile import tempfile import os - +from www.services.etl.transformer import transform_dataframe +from www.services.etl.validator import validate_dataframe +import pandas as pd def format_ab_column(entry, source, file_type): # Function for AB Column (format--> "Abstract") abstract = '' @@ -1632,11 +1634,21 @@ def process_single_file(data, source, file_type, author): entry_data.pop('AF', None) # Remove 'AF' if it exists elif author == "fullname": entry_data.pop('AU', None) # Remove 'AU' if it exists + entries.append(entry_data) + + # ===================================================== + # ETL STANDARDIZATION + VALIDATION PIPELINE + # ===================================================== - entries.append(entry_data) + df = pd.DataFrame(entries) - return entries + try: + df = transform_dataframe(df, source.upper()) + validate_dataframe(df) + except Exception as e: + print(f"ETL Validation Warning: {e}") + return df.to_dict(orient="records") def biblio_json(data, source, type, author): """ diff --git a/www/services/histnetwork.py b/www/services/histnetwork.py index 7848d9744..02cdf97bf 100644 --- a/www/services/histnetwork.py +++ b/www/services/histnetwork.py @@ -3,24 +3,9 @@ def histNetwork(df, min_citations=0, sep=";", network=True): - """ - Create a historical network of citations from a DataFrame containing metadata of scientific papers. - - Args: - df (DataFrame): A DataFrame containing metadata of scientific papers. - min_citations (int): Minimum number of citations to include a paper in the analysis. - sep (str): Separator used to separate references in the citation network. - network (bool): If True, a citation network is created. - - Returns: - A dictionary containing the following keys: - - NetMatrix: A DataFrame containing the citation network. - - histData: A DataFrame containing the metadata of the papers. - - M: A DataFrame containing the metadata of the papers with the Local Citation Score (LCS). - - LCS: A list containing the Local Citation Score of each paper. - """ - M = df.get() - db = M['DB'][0] + + M = df + db = M['DB'].iloc[0] # Ensure required fields are present if 'DI' not in M: @@ -35,9 +20,29 @@ def histNetwork(df, min_citations=0, sep=";", network=True): M['TC'] = M['TC'].fillna(0) if db == "Web_of_Science": - results = wos(M, min_citations=min_citations, sep=sep, network=network) + results = wos( + M, + min_citations=min_citations, + sep=sep, + network=network + ) + elif db == "Scopus": - results = scopus(M, min_citations=min_citations, sep=sep, network=network) + results = scopus( + M, + min_citations=min_citations, + sep=sep, + network=network + ) + + elif db == "OPENALEX": + results = wos( + M, + min_citations=min_citations, + sep=sep, + network=network + ) + else: print("\nDatabase not compatible with direct citation analysis\n") return None @@ -76,36 +81,98 @@ def wos(M, min_citations, sep, network): print(f"\nAnalyzing {len(CR)} reference items...\n") CR_df = pd.DataFrame(CR) + if M["DB"].iloc[0] == "OPENALEX": + + M["UT_CLEAN"] = ( + M["UT"] + .fillna("") + .astype(str) + .str.upper() + .str.replace(".", "", regex=False) + ) + + CR_df["UT_CLEAN"] = ( + CR_df["ref"] + .fillna("") + .astype(str) + .str.upper() + .str.replace(".", "", regex=False) + ) + + L = pd.merge( + M, + CR_df, + left_on="UT_CLEAN", + right_on="UT_CLEAN", + how="left" + ) + + + L = L[L["Paper_y"].notnull()] + + # Add LABEL field to M and CR - M['LABEL'] = M['SR_FULL'].fillna('').str.upper() + " DOI " + M['DI'].fillna('').str.upper() + label_col = 'SR_FULL' if 'SR_FULL' in M.columns else 'SR' + + M['LABEL'] = ( + M[label_col].fillna('').astype(str).str.upper() + + " DOI " + + M['DI'].fillna('').astype(str).str.upper()) + M['LABEL'] = M['LABEL'].str.strip() CR_df['LABEL'] = CR_df['SR'].fillna('').str.upper() + " DOI " + CR_df['DI'].fillna('').str.upper() CR_df['LABEL'] = CR_df['LABEL'].str.strip() + + # Match references with papers (left join as in R) - L = pd.merge(M, CR_df, on='LABEL', how='left', suffixes=('_M', '_CR')) - L = L[L['Paper_CR'].notnull()] - L['CITING'] = M.loc[L['Paper_CR'], 'LABEL'].values - L['nCITING'] = M.loc[L['Paper_CR'], 'nLABEL'].values - L['CIT_PY'] = M.loc[L['Paper_CR'], 'PY'].values + L = pd.merge( + M, + CR_df, + left_on="UT_CLEAN", + right_on="UT_CLEAN", + how="left" +) + + + L = L[L["Paper_y"].notnull()] + L["CITING"] = M.loc[L["Paper_x"].astype(int), "UT"].values + L["CITED"] = M.loc[L["Paper_y"].astype(int), "UT"].values + L["CIT_PY"] = M.loc[L["Paper_x"].astype(int), "PY"].values # Compute Local Citation Scores (LCS) - LCS = L.groupby('nLABEL').size().reset_index(name='LCS') - M['LCS'] = M['nLABEL'].map(LCS.set_index('nLABEL')['LCS']).fillna(0).astype(int) + LCS = L.groupby('Paper_y').size().reset_index(name='LCS') - # Prepare histData - histData = M[M['TC'] >= min_citations][['LABEL', 'TI', 'DE', 'ID', 'DI', 'PY', 'LCS', 'TC']] - histData.columns = ['Paper', 'Title', 'Author_Keywords', 'KeywordsPlus', 'DOI', 'Year', 'LCS', 'GCS'] + M['LCS'] = ( + M.index.to_series() + .map(LCS.set_index('Paper_y')['LCS']) + .fillna(0) + .astype(int)) + # Prepare histData + histData = M[M['TC'] >= min_citations][ + ['UT', 'TI', 'DE', 'ID', 'DI', 'PY', 'LCS', 'TC'] + ].copy() + + histData.columns = [ + 'Paper', + 'Title', + 'Author_Keywords', + 'KeywordsPlus', + 'DOI', + 'Year', + 'LCS', + 'GCS' +] WLCR = None if network: # Build citation network - CITING = L.groupby('CITING').agg( - LCR=('LABEL', lambda x: ';'.join(x.dropna())), - PY=('CIT_PY', 'first'), - Paper=('Paper_CR', 'first') - ).reset_index().sort_values(by='PY') + CITING = L.groupby("CITING").agg( + LCR=("CITED", lambda x: ";".join(x.astype(str))), + PY=("CIT_PY", "first"), + Paper=("Paper_x", "first") + ).reset_index().sort_values(by="PY") # Assign LCR to the correct Paper index (Paper is 0-based) M['LCR'] = "" @@ -127,10 +194,17 @@ def wos(M, min_citations, sep, network): M.index = M['LABEL'].str.strip() M['LCR'] = M['LCR'].fillna('') - + # Ensure all papers are included as both rows and columns - WLCR = cocMatrix(reactive.Value(M), Field="LCR", sep=sep) + WLCR = cocMatrix(M, Field="LCR", sep=sep) + if WLCR is None: + print("No Local Citation Relationships found.") + return { + "histData": pd.DataFrame(), + "M": M, + "LCS": [] + } # Trova le LABEL mancanti missing_LABEL = set(M.index) - set(WLCR.columns) diff --git a/www/services/metatagextraction.py b/www/services/metatagextraction.py index 5e1f8b9c8..bdbeb2311 100644 --- a/www/services/metatagextraction.py +++ b/www/services/metatagextraction.py @@ -2,19 +2,8 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): - """ - Extract metadata tags from a DataFrame based on the specified field. - - Args: - df: A DataFrame object containing the data. - Field: The field to extract metadata tags from. - sep: The separator used to split the metadata tags. - aff_disamb: A boolean value indicating whether to disambiguate the affiliations. - - Returns: - A DataFrame with the extracted metadata tags. - """ - M = df.get() + + M = df if Field == "SR": M = SR(M) @@ -41,34 +30,58 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): a = ind[ind > -1].index M.loc[a, "AU1_UN"] = M.loc[a, "AU1_UN"].str[ind[a] + 2:] - df.set(M) + df = M return df def SR(M): listAU = M["AU"].apply(lambda l: [x.strip() for x in l]) + if M["DB"].iloc[0].lower() == "scopus": - listAU = listAU.apply(lambda l: [x.replace(" ", ",").replace(",,", ",").replace(" ", "") for x in l]) - FirstAuthors = listAU.apply(lambda l: l[0] if len(l) > 0 else "NA").str.replace(",", " ") + listAU = listAU.apply( + lambda l: [ + x.replace(" ", ",") + .replace(",,", ",") + .replace(" ", "") + for x in l + ] + ) + + FirstAuthors = listAU.apply( + lambda l: l[0] if len(l) > 0 else "NA" + ).str.replace(",", " ") + + # OpenAlex compatibility + if "JI" not in M.columns: + if "SO" in M.columns: + M["JI"] = M["SO"] + else: + M["JI"] = "" no_art = M["JI"] == "" M.loc[no_art, "JI"] = M.loc[no_art, "SO"] + J9 = M["JI"].str.replace(".", " ", regex=False).str.strip() + SR = FirstAuthors + ", " + M["PY"].astype(str) + ", " + J9 M["SR_FULL"] = SR.str.replace(r"\s+", " ", regex=True) - st = i = 0 + st = 0 + i = 0 + while st == 0: ind = SR.duplicated() + if ind.any(): i += 1 SR[ind] = SR[ind] + "-" + chr(96 + i) else: st = 1 + M["SR"] = SR.str.replace(r"\s+", " ", regex=True) - + return M diff --git a/www/services/thematicmap.py b/www/services/thematicmap.py index 3c313b7f6..55194d952 100644 --- a/www/services/thematicmap.py +++ b/www/services/thematicmap.py @@ -7,13 +7,13 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): # df = metaTagExtraction(df, field=field) M = df - m = df.get() + # Set ngrams based on field ngrams = int(ngrams) if field in ['TI', 'AB'] else 1 # Set stemming as boolean stemming = True if stemming == "Yes" else False - minfreq = max(0, int(minfreq * len(m) // 1000)) + minfreq = max(0, int(minfreq * len(M) // 1000)) # Preprocess field and create network matrix if field == "ID": @@ -314,8 +314,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz ) ) fig = go.FigureWidget(fig) - fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'], - 'displaylogo': False} + fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],'displaylogo': False} ############################################################################################################################################## @@ -335,7 +334,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz df = df[['Cluster', 'CallonCentrality', 'CallonDensity', 'RankCentrality', 'RankDensity', 'ClusterFrequency']] # Handle document clustering - document_to_clusters = cluster_assignment(M=m, words=df_lab, field=field, remove_terms=remove_terms, synonyms=synonyms, threshold=0.5) + document_to_clusters = cluster_assignment(M=M, words=df_lab, field=field, remove_terms=remove_terms, synonyms=synonyms, threshold=0.5) # Create parameters dictionary and unpack into dataframe params = { @@ -660,14 +659,16 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh year = pd.Timestamp.now().year + 1 M = M.reset_index(drop=True) - terms = (M.assign( + terms = ( + M.assign( TCpY=lambda x: x['TC']/(year-x['PY']), NTC=lambda x: x.groupby('PY')['TC'].transform(lambda y: y/y.mean()) - )[['DI', 'AU', 'TI', 'SO', 'PY', 'TC', 'TCpY', 'NTC', 'SR']] - .merge(terms, on='SR') - .fillna(0) - .groupby('Assigned_cluster') - .apply(lambda x: x.sort_values('TC', ascending=False)) - .reset_index(drop=True)) - + )[['DI','AU','TI','SO','PY','TC','TCpY','NTC','SR']] + .merge(terms, on='SR') + .fillna(0) + .infer_objects(copy=False) + .groupby('Assigned_cluster') + .apply(lambda x: x.sort_values('TC', ascending=False)) + .reset_index(drop=True)) + return terms