diff --git a/.streamlit/config.toml b/.streamlit/config.toml
new file mode 100644
index 000000000..8b3c1a01b
--- /dev/null
+++ b/.streamlit/config.toml
@@ -0,0 +1,6 @@
+[theme]
+primaryColor="#00A86B"
+backgroundColor="#0A0A0A"
+secondaryBackgroundColor="#1A1A1A"
+textColor="#FFFFFF"
+font="sans serif"
\ No newline at end of file
diff --git a/dashboard.py/dashboard_app.py b/dashboard.py/dashboard_app.py
new file mode 100644
index 000000000..6f44c7062
--- /dev/null
+++ b/dashboard.py/dashboard_app.py
@@ -0,0 +1,320 @@
+import sys
+from pathlib import Path
+import plotly.express as px
+
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+
+import streamlit as st
+from www.services.etl.pipeline import openalex_pipeline
+import pandas as pd
+
+st.set_page_config(
+ page_title="Bibliometrix Dashboard",
+ layout="wide"
+)
+
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+st.title("๐ Bibliometrix Dashboard")
+st.write("OpenAlex ETL + Bibliometric Analysis")
+
+colA, colB = st.columns([3, 1])
+
+with colA:
+ query = st.text_input(
+ "",
+ placeholder="๐ Search topics like AI, Machine Learning, Data Science..."
+ )
+
+with colB:
+ max_results = st.selectbox(
+ "Documents",
+ [50, 100, 200, 500],
+ index=1
+ )
+
+if not query:
+ query = "machine learning"
+
+df = openalex_pipeline(
+ query=query,
+ max_results=max_results
+)
+col1, col2, col3, col4 = st.columns(4)
+
+with col1:
+ st.metric("Documents", len(df))
+
+with col2:
+ st.metric(
+ "Authors",
+ df["AU"].explode().dropna().nunique()
+ )
+
+with col3:
+ st.metric(
+ "Keywords",
+ df["DE"].explode().dropna().nunique()
+ )
+
+with col4:
+ st.metric(
+ "Total Citations",
+ int(df["TC"].fillna(0).sum())
+ )
+# ==================================================
+# DATASET PREVIEW
+# ==================================================
+
+st.markdown("""
+
+๐ STANDARDIZED DATASET PREVIEW
+
+""", unsafe_allow_html=True)
+
+st.dataframe(df, use_container_width=True)
+
+# ==================================================
+# PUBLICATIONS BY YEAR
+# ==================================================
+
+st.divider()
+
+st.markdown("""
+
+๐ PUBLICATIONS BY YEAR
+
+""", unsafe_allow_html=True)
+
+year_counts = df["PY"].value_counts().sort_index()
+
+fig = px.bar(
+ x=year_counts.index,
+ y=year_counts.values
+)
+
+fig.update_traces(
+ marker_color="#D4AF37"
+)
+
+fig.update_layout(
+ paper_bgcolor="#0A0A0A",
+ plot_bgcolor="#0A0A0A",
+ font_color="white",
+ xaxis_title="Year",
+ yaxis_title="Publications",
+ yaxis=dict(dtick=1),
+ showlegend=False
+)
+
+st.plotly_chart(fig, use_container_width=True)
+
+# ==================================================
+# TOP CITED PAPERS
+# ==================================================
+
+st.divider()
+
+st.markdown("""
+
+๐ TOP 10 MOST CITED PAPERS
+
+""", unsafe_allow_html=True)
+
+if "TC" in df.columns:
+
+ top_papers = (
+ df.sort_values("TC", ascending=False)
+ [["TI", "TC"]]
+ .head(10)
+ )
+
+ st.dataframe(top_papers, use_container_width=True)
+
+# ==================================================
+# TOP AUTHORS
+# ==================================================
+
+st.divider()
+
+st.markdown("""
+
+๐ฅ TOP AUTHORS
+
+""", unsafe_allow_html=True)
+
+top_authors = (
+ df["AU"]
+ .explode()
+ .dropna()
+ .value_counts()
+ .head(10)
+ .reset_index()
+)
+
+top_authors.columns = ["Author", "Publications"]
+
+# Ranking Column
+top_authors.insert(
+ 0,
+ "Rank",
+ ["๐ฅ", "๐ฅ", "๐ฅ", "4", "5", "6", "7", "8", "9", "10"]
+)
+
+col1, col2 = st.columns([3, 1])
+
+with col1:
+ st.dataframe(
+ top_authors,
+ use_container_width=True,
+ hide_index=True
+ )
+
+with col2:
+ st.metric(
+ "TOP AUTHOR",
+ top_authors.iloc[0]["Author"]
+ )
+
+ st.metric(
+ "PUBLICATIONS",
+ int(top_authors.iloc[0]["Publications"])
+ )
+# ==================================================
+# TOP KEYWORDS
+# ==================================================
+
+st.divider()
+
+st.markdown("""
+
+๐ TOP KEYWORDS
+
+""", unsafe_allow_html=True)
+
+top_keywords = (
+ df["DE"]
+ .explode()
+ .dropna()
+ .value_counts()
+ .head(10)
+ .reset_index()
+)
+
+top_keywords.columns = ["Keyword", "Frequency"]
+
+fig = px.bar(
+ top_keywords.sort_values("Frequency"),
+ x="Frequency",
+ y="Keyword",
+ orientation="h",
+ text="Frequency"
+)
+
+fig.update_traces(
+ marker_color="#D4AF37",
+ textposition="outside"
+)
+
+fig.update_layout(
+ paper_bgcolor="#0A0A0A",
+ plot_bgcolor="#0A0A0A",
+ font_color="white",
+ xaxis_title="Frequency",
+ yaxis_title="",
+ showlegend=False,
+ height=600,
+ margin=dict(l=20, r=20, t=20, b=20)
+)
+
+fig.update_xaxes(
+ nticks=8
+)
+
+
+st.plotly_chart(fig, use_container_width=True)
+st.divider()
+
+st.markdown("""
+
+โฌ EXPORT RESULTS
+
+""", unsafe_allow_html=True)
+
+csv = df.to_csv(index=False)
+
+st.download_button(
+ label="๐ฅ Download Dataset (CSV)",
+ data=csv,
+ file_name=f"{query}_bibliometric_data.csv",
+ mime="text/csv"
+)
+
+st.divider()
+
+st.markdown("""
+
+
+
+๐ Bibliometrix Dashboard
+
+
+Developed by
+Madhumithra Balasubramanian
+Aya Soundous Hechaichi
+Alina Siddiqui
+
+
+
+Technologies Used
+Python โข Streamlit โข OpenAlex API โข Bibliometrix Framework
+
+
+
+Hardware and Software for Big Data โ Mod B
+University of Naples Federico II
+
+
+
+Professor: Vincenzo Moscato
Data Science Course โ Academic Year 2025/2026
+
+""", unsafe_allow_html=True)
\ No newline at end of file
diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py
index 512ed7489..f89394677 100644
--- a/functions/get_collaborationnetwork.py
+++ b/functions/get_collaborationnetwork.py
@@ -46,7 +46,6 @@ def get_collaboration_network(
print("Generating collaboration network...")
M = df
- m = df.get()
NetRefs = None
Title = ""
diff --git a/www/services/biblionetwork.py b/www/services/biblionetwork.py
index 7e65b4880..4825cb4c7 100644
--- a/www/services/biblionetwork.py
+++ b/www/services/biblionetwork.py
@@ -71,8 +71,8 @@ def crossprod(A, B):
filtered_index = [idx for idx in NetMatrix.index if str(idx).strip()]
NetMatrix = NetMatrix.loc[filtered_index, filtered_columns]
- M = M.get() # Estrai il dizionario se M รจ un oggetto
-
+ # M is already a DataFrame
+
db_name = M["DB"].iloc[0]
print(f"db_name: {db_name}")
if network == "references" and db_name == "SCOPUS":
diff --git a/www/services/cocmatrix.py b/www/services/cocmatrix.py
index f523aed67..c53b5a972 100644
--- a/www/services/cocmatrix.py
+++ b/www/services/cocmatrix.py
@@ -1,64 +1,99 @@
from .utils import *
-def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short=False, remove_terms=None, synonyms=None):
+def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True,
+ short=False, remove_terms=None, synonyms=None):
+
"""
Computes occurrences between elements of a Tag Field from a bibliographic data frame.
-
- Args:
- M: A DataFrame obtained by the converting function. It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
- Field: A string indicating one of the field tags of the standard ISI WoS Field Tag codify.
- type: Indicates the output format of co-occurrences ("matrix" or "sparse").
- n: An integer indicating the number of items to select. If None, all items are selected.
- sep: The field separator character.
- binary: A boolean. If True each cell contains a 0/1. If False each cell contains the frequency.
- short: A boolean. If True all items with frequency < 2 are deleted to reduce the matrix size.
- remove_terms: A list of additional terms to delete from the documents before term extraction.
- synonyms: A list of synonyms that will be merged into a single term.
-
- Returns:
- A bipartite network matrix with cases corresponding to manuscripts and variables to the objects extracted from the Tag Field.
"""
- M = df.get()
+
+ # Support both wrapped objects and pandas DataFrames
+ if hasattr(df, "get") and not isinstance(df, pd.DataFrame):
+ M = df
+ else:
+ M = df
if "LABEL" not in M.columns:
M.index = M["SR"]
print("Processing field: " + Field + "\n")
+
RowNames = M.index
# REMOVE TERMS AND MERGE SYNONYMS
if Field in ["ID", "DE", "TI", "TI_TM", "AB", "AB_TM"]:
- Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)])
- TERMS = pd.DataFrame({"item": [item.upper() for sublist in Fi for item in sublist], "SR": M.index.repeat(Fi.str.len())})
+
+ Fi = M[Field].fillna("").apply(
+ lambda x: x if isinstance(x, list)
+ else [i.strip() for i in str(x).split(sep)]
+ )
+
+ TERMS = pd.DataFrame({
+ "item": [item.upper() for sublist in Fi for item in sublist],
+ "SR": M.index.repeat(Fi.str.len())
+ })
# Merge synonyms
if synonyms:
- synonyms_dict = {syn.split(";")[0].strip().upper(): [s.strip().upper() for s in syn.split(";")[1:]] for syn in synonyms}
+ synonyms_dict = {
+ syn.split(";")[0].strip().upper():
+ [s.strip().upper() for s in syn.split(";")[1:]]
+ for syn in synonyms
+ }
+
for key, values in synonyms_dict.items():
TERMS["item"] = TERMS["item"].replace(values, key)
# Remove terms
if remove_terms:
- TERMS = TERMS[~TERMS["item"].str.upper().isin([term.strip().upper() for term in remove_terms])]
+ TERMS = TERMS[
+ ~TERMS["item"].str.upper().isin(
+ [term.strip().upper() for term in remove_terms]
+ )
+ ]
+
+ TERMS = TERMS.groupby("SR")["item"].apply(
+ lambda x: ";".join(x)
+ ).reset_index()
+
+ M = (
+ M.drop(columns=[Field, "SR"])
+ .merge(TERMS, on="SR", how="left")
+ .rename(columns={"item": Field})
+ )
- TERMS = TERMS.groupby("SR")["item"].apply(lambda x: ";".join(x)).reset_index()
- M = M.drop(columns=[Field, 'SR']).merge(TERMS, on="SR", how="left").rename(columns={"item": Field})
M.index = RowNames
if Field == "CR":
- M["CR"] = M["CR"].apply(lambda x: [ref.replace("DOI;", "DOI ") for ref in x] if isinstance(x, list) else x)
+ M["CR"] = M["CR"].apply(
+ lambda x: [ref.replace("DOI;", "DOI ") for ref in x]
+ if isinstance(x, list)
+ else x
+ )
if Field in M.columns:
- Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)])
+ Fi = M[Field].fillna("").apply(
+ lambda x: x if isinstance(x, list)
+ else [i.strip() for i in str(x).split(sep)]
+ )
else:
print(f"Field {Field} is not a column name of input data frame")
- return
+ return None
+
+ Fi = Fi.apply(lambda x: [i.strip() for i in x])
- Fi = Fi.apply(lambda x: [i.strip() for i in x]) # Equivalent to trim.leading in R
if Field == "CR":
- Fi = Fi.apply(lambda x: [i for i in x if len(i) > 10]) # Delete not congruent references
+ Fi = Fi.apply(
+ lambda x: [i for i in x if len(i) > 10]
+ )
+
+ allField = [
+ item
+ for sublist in Fi
+ for item in sublist
+ if item
+ ]
- allField = [item for sublist in Fi for item in sublist if item]
if Field == "CR":
allField = reduceRefs(allField)
Fi = Fi.apply(reduceRefs)
@@ -81,30 +116,52 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short
WF = lil_matrix((M.shape[0], len(uniqueField)))
else:
print("Error in type argument")
- return
+ return None
+
+ col_idx = {
+ term: idx
+ for idx, term in enumerate(uniqueField)
+ }
- col_idx = {term: idx for idx, term in enumerate(uniqueField)}
- row_idx = {sr: idx for idx, sr in enumerate(M.index)}
+ row_idx = {
+ sr: idx
+ for idx, sr in enumerate(M.index)
+ }
for i, terms in Fi.items():
+
if terms:
+
if binary:
- indices = [col_idx[term] for term in set(terms) if term in col_idx]
+
+ indices = [
+ col_idx[term]
+ for term in set(terms)
+ if term in col_idx
+ ]
+
WF[row_idx[i], indices] = 1
+
else:
+
term_counts = pd.Series(terms).value_counts()
+
for term, count in term_counts.items():
+
if term in col_idx:
WF[row_idx[i], col_idx[term]] = count
if type == "sparse" and not binary:
WF = lil_matrix(WF)
- # Convert the sparse matrix to a DataFrame for better readability
- WF_df = pd.DataFrame(WF.toarray(), index=M.index, columns=uniqueField)
+ WF_df = pd.DataFrame(
+ WF.toarray(),
+ index=M.index,
+ columns=uniqueField
+ )
+
if binary:
- WF_df = WF_df.astype(int) # Ensure binary values are 0 and 1
- # print(WF_df)
+ WF_df = WF_df.astype(int)
return WF_df
diff --git a/www/services/couplingmap.py b/www/services/couplingmap.py
index a2b3628d7..3819207cd 100644
--- a/www/services/couplingmap.py
+++ b/www/services/couplingmap.py
@@ -6,7 +6,7 @@
from .histnetwork import *
from .metatagextraction import *
from .tabletag import *
-
+import pandas as pd
def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5,
ngrams=1, community_repulsion=0.1, impact_measure="local",
stemming=False, size=0.5, label_term=None, n_labels=1, repel=True, clustering="walktrap"):
@@ -15,8 +15,8 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5,
print('\nanalysis argument is incorrect.\n\nPlease select one of the following choices: "documents", "authors", "sources"\n\n')
return None
- df = metaTagExtraction(df, "SR") # serve questo per avere il merging perfetto per uniformare la colonna SR
- M = df.get()
+ df = metaTagExtraction(df, "SR")
+ M = df
ngrams = int(ngrams)
minfreq = max(0, int(minfreq * len(M) // 1000))
@@ -62,13 +62,24 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5,
C = L.merge(Net['cluster_res'], on=analysis, how='left', copy=True)
# Get group membership and colors
- group = Net['cluster_obj'].membership
- color = net.vs['color']
-
- # Convert colors to hex and handle NaN values
- color = [to_hex(c) if pd.notna(c) else "#D3D3D3" for c in color]
- # color[pd.isna(color)] = "#B3B3B3" # Colore grigio chiaro in formato RGBA
+ group = list(Net['cluster_obj'].membership)
+ color = list(net.vs['color'])
+
+ # Convert colors
+ color = [
+ to_hex(c) if pd.notna(c) else "#D3D3D3"
+ for c in color
+ ]
+ # Make lengths match
+ min_len = min(len(D), len(group), len(color))
+
+ D = D.iloc[:min_len].copy()
+
+ group = group[:min_len]
+ color = color[:min_len]
+
+
D['group'] = group
D['color'] = color
@@ -308,68 +319,95 @@ def limit_to_first(text):
#### FUNCTION DA METTERE IN SERVICES???
# Normalizzazione del punteggio di citazione
def normalizeCitationScore(df, field="documents", impact_measure="local"):
+
if field not in ["documents", "authors", "sources"]:
- print('\nfield argument is incorrect.\n\nPlease select one of the following choices: "documents", "authors", "sources"\n\n')
+ print(
+ '\nfield argument is incorrect.\n\n'
+ 'Please select one of the following choices: '
+ '"documents", "authors", "sources"\n\n'
+ )
return None
- # Applica localCitations se richiesto
+ # OpenAlex workaround
if impact_measure == "local":
- df = localCitations(df, fast_search=False, sep=";")['M']
+
+ if df["DB"].iloc[0] == "OPENALEX":
+ df["LCS"] = 1
+
+ else:
+ df = localCitations(
+ df,
+ fast_search=False,
+ sep=";"
+ )["M"]
+
else:
- df['LCS'] = 0
+ df["LCS"] = 0
+
+ df["TC"] = pd.to_numeric(df["TC"], errors="coerce")
+ df["PY"] = pd.to_numeric(df["PY"], errors="coerce")
- # Converte colonne in numerico
- df['TC'] = df['TC'].astype(float, errors='ignore')
- df['PY'] = df['PY'].astype(float, errors='ignore')
+ df["LCS"] = df["LCS"].replace(0, 1)
- # Rimpiazza LCS=0 con 1 e calcola NGCS/NLCS per anno
- df['LCS'] = df['LCS'].replace(0, 1)
- df['NGCS'] = df.groupby('PY')['TC'].transform(lambda x: x / x.mean(skipna=True))
- df['NLCS'] = df.groupby('PY')['LCS'].transform(lambda x: x / x.mean(skipna=True))
+ df["NGCS"] = df.groupby("PY")["TC"].transform(
+ lambda x: x / x.mean()
+ )
+
+ df["NLCS"] = df.groupby("PY")["LCS"].transform(
+ lambda x: x / x.mean()
+ )
- # Suddivisione per tipo di campo richiesto
if field == "documents":
- NCS = df[['SR', 'PY', 'NGCS', 'NLCS', 'TC', 'LCS']].rename(columns={
- 'NGCS': 'MNGCS',
- 'NLCS': 'MNLCS',
- 'LCS': 'LC',
- 'SR': 'documents'
- })
+
+ NCS = df[
+ ["SR", "PY", "NGCS", "NLCS", "TC", "LCS"]
+ ].rename(
+ columns={
+ "SR": "documents",
+ "NGCS": "MNGCS",
+ "NLCS": "MNLCS",
+ "LCS": "LC"
+ }
+ )
elif field == "authors":
- df['AU'] = df['AU'].fillna('').str.split(';') # Divide gli autori
- exploded = df.explode('AU').assign(AU=lambda x: x['AU'].str.strip()) # Espande e rimuove spazi extra
+
+ exploded = (
+ df.assign(AU=df["AU"].str.split(";"))
+ .explode("AU")
+ )
NCS = (
- exploded.groupby('AU').agg(
- NP=('PY', 'count'),
- MNGCS=('NGCS', 'mean'),
- MNLCS=('NLCS', 'mean'),
- TC=('TC', 'mean'),
- LC=('LCS', 'mean')
+ exploded.groupby("AU")
+ .agg(
+ NP=("PY", "count"),
+ MNGCS=("NGCS", "mean"),
+ MNLCS=("NLCS", "mean"),
+ TC=("TC", "mean"),
+ LC=("LCS", "mean")
)
.reset_index()
- .rename(columns={'AU': 'authors'})
+ .rename(columns={"AU": "authors"})
)
- elif field == "sources":
+ else:
+
NCS = (
- df.groupby('SO').agg(
- NP=('PY', 'count'),
- MNGCS=('NGCS', 'mean'),
- MNLCS=('NLCS', 'mean'),
- TC=('TC', 'mean'),
- LC=('LCS', 'mean')
+ df.groupby("SO")
+ .agg(
+ NP=("PY", "count"),
+ MNGCS=("NGCS", "mean"),
+ MNLCS=("NLCS", "mean"),
+ TC=("TC", "mean"),
+ LC=("LCS", "mean")
)
.reset_index()
- .rename(columns={'SO': 'sources'})
+ .rename(columns={"SO": "sources"})
)
- # Gestione impatto globale
if impact_measure == "global":
- NCS.drop(columns=['MNLCS', 'LC'], errors='ignore', inplace=True)
- else:
- NCS['MNLCS'] = NCS['MNLCS'].fillna(0)
+ NCS["MNLCS"] = NCS["MNGCS"]
+ NCS["LC"] = NCS["TC"]
return NCS
@@ -517,7 +555,7 @@ def best_lab(df, tab_global, n_labels, term):
def localCitations(df, fast_search=False, sep=";"):
df = metaTagExtraction(df, "SR")
- M = df.get()
+ M = df
M['TC'] = M['TC'].fillna(0)
if fast_search:
loccit = M['TC'].quantile(0.75)
diff --git a/www/services/etl/__init__.py b/www/services/etl/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/www/services/etl/api_retriever.py b/www/services/etl/api_retriever.py
new file mode 100644
index 000000000..e0f975e52
--- /dev/null
+++ b/www/services/etl/api_retriever.py
@@ -0,0 +1,116 @@
+import requests
+import time
+
+
+OPENALEX_URL = "https://api.openalex.org/works"
+
+
+def fetch_openalex(query, max_results=100):
+
+ results = []
+ per_page = 25
+ page = 1
+ retries = 3
+
+ while len(results) < max_results:
+
+ params = {
+ "search": query,
+ "per-page": per_page,
+ "page": page
+ }
+
+ success = False
+
+ for attempt in range(retries):
+
+ try:
+
+ response = requests.get(
+ OPENALEX_URL,
+ params=params,
+ timeout=30
+ )
+
+ response.raise_for_status()
+
+ data = response.json()
+
+ works = data.get("results", [])
+
+ if not works:
+ return results[:max_results]
+
+ results.extend(works)
+
+ success = True
+
+ break
+
+ except requests.exceptions.RequestException:
+
+ time.sleep(2)
+
+ if not success:
+ break
+
+ page += 1
+
+ time.sleep(1)
+
+ return results[:max_results]
+
+
+def openalex_to_records(results):
+
+ records = []
+
+ for work in results:
+
+ authors = [
+ a.get("author", {}).get("display_name", "")
+ for a in work.get("authorships", [])
+ ]
+
+ affiliations = []
+
+ for a in work.get("authorships", []):
+ for inst in a.get("institutions", []):
+ name = inst.get("display_name", "")
+ if name:
+ affiliations.append(name)
+
+ keywords = [
+ k.get("display_name", "")
+ for k in work.get("keywords", [])
+ ]
+
+ concepts = [
+ c.get("display_name", "")
+ for c in work.get("concepts", [])
+ ]
+
+ record = {
+ "DB": "OPENALEX",
+ "UT": work.get("id", ""),
+ "DI": work.get("doi", ""),
+ "TI": work.get("title", ""),
+ "PY": work.get("publication_year", ""),
+ "AB": str(work.get("abstract_inverted_index", {})),
+ "TC": work.get("cited_by_count", 0),
+ "CR": work.get("referenced_works", []),
+ "SO": work.get("display_name", ""),
+ "SR": work.get("display_name", ""),
+ "AU": authors,
+ "AF": authors,
+ "DE": keywords,
+ "ID": concepts,
+ "C1": affiliations,
+ "DT": work.get("type", ""),
+ "LA": work.get("language", ""),
+ "RP": authors[0] if authors else ""
+ }
+
+ records.append(record)
+
+ return records
\ No newline at end of file
diff --git a/www/services/etl/dispatcher.py b/www/services/etl/dispatcher.py
new file mode 100644
index 000000000..e90af05b1
--- /dev/null
+++ b/www/services/etl/dispatcher.py
@@ -0,0 +1,29 @@
+"""
+Dispatcher for selecting mappings
+based on source database.
+"""
+
+from .mappings import (
+ SCOPUS_MAPPING,
+ DIMENSIONS_MAPPING,
+ PUBMED_MAPPING,
+ OPENALEX_MAPPING
+)
+
+
+def get_mapping(source: str):
+ source = source.upper()
+
+ if source == "SCOPUS":
+ return SCOPUS_MAPPING
+
+ if source == "DIMENSIONS":
+ return DIMENSIONS_MAPPING
+
+ if source == "PUBMED":
+ return PUBMED_MAPPING
+
+ if source == "OPENALEX":
+ return OPENALEX_MAPPING
+
+ raise ValueError(f"Unsupported source: {source}")
\ No newline at end of file
diff --git a/www/services/etl/mappings.py b/www/services/etl/mappings.py
new file mode 100644
index 000000000..ae01cfc1e
--- /dev/null
+++ b/www/services/etl/mappings.py
@@ -0,0 +1,52 @@
+SCOPUS_MAPPING = {
+ "Authors": "AU",
+ "Author full names": "AF",
+ "Title": "TI",
+ "Source title": "SO",
+ "Year": "PY",
+ "DOI": "DI",
+ "Abstract": "AB",
+ "Volume": "VL",
+ "Issue": "IS",
+ "Page start": "BP",
+ "Page end": "EP",
+ "Author Keywords": "DE",
+ "Index Keywords": "ID",
+ "Language of Original Document": "LA",
+ "Document Type": "DT",
+ "Affiliations": "C1",
+ "References": "CR",
+ "PubMed ID": "PMID"
+}
+
+DIMENSIONS_MAPPING = {
+ "Authors": "AU",
+ "Title": "TI",
+ "Source title": "SO",
+ "Year": "PY",
+ "DOI": "DI",
+ "Abstract": "AB"
+}
+
+PUBMED_MAPPING = {
+ "PMID": "PMID",
+ "Title": "TI",
+ "Abstract": "AB",
+ "Authors": "AU",
+ "Journal": "SO",
+ "Year": "PY",
+ "Keywords": "DE"
+}
+
+OPENALEX_MAPPING = {
+ "id": "UT",
+ "doi": "DI",
+ "title": "TI",
+ "publication_year": "PY",
+ "abstract": "AB",
+ "display_name": "SO",
+ "referenced_works": "CR",
+ "authorships": "AU",
+ "institutions": "C1",
+ "cited_by_count": "TC"
+}
\ No newline at end of file
diff --git a/www/services/etl/pipeline.py b/www/services/etl/pipeline.py
new file mode 100644
index 000000000..15103d89a
--- /dev/null
+++ b/www/services/etl/pipeline.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+from .api_retriever import (
+ fetch_openalex,
+ openalex_to_records
+)
+
+from .transformer import transform_dataframe
+from .validator import validate_dataframe
+
+
+def openalex_pipeline(query, max_results=100):
+
+ results = fetch_openalex(
+ query=query,
+ max_results=max_results
+ )
+
+ records = openalex_to_records(results)
+
+ df = pd.DataFrame(records)
+
+ df = transform_dataframe(
+ df,
+ source="openalex"
+ )
+
+ validate_dataframe(df)
+
+ return df
\ No newline at end of file
diff --git a/www/services/etl/schema.py b/www/services/etl/schema.py
new file mode 100644
index 000000000..a585cebc2
--- /dev/null
+++ b/www/services/etl/schema.py
@@ -0,0 +1,35 @@
+MANDATORY_COLUMNS = [
+ "AU",
+ "TI",
+ "SO",
+ "PY"
+]
+
+MULTI_VALUE_COLUMNS = [
+ "AU",
+ "AF",
+ "C1",
+ "CR",
+ "DE",
+ "ID"
+]
+
+REQUIRED_COLUMNS = [
+ "SR",
+ "AU",
+ "AF",
+ "TI",
+ "SO",
+ "PY",
+ "DI",
+ "DE",
+ "ID",
+ "CR",
+ "C1",
+ "AB",
+ "TC",
+ "DT",
+ "LA",
+ "DB",
+ "RP"
+]
\ No newline at end of file
diff --git a/www/services/etl/test_biblionetwork.py b/www/services/etl/test_biblionetwork.py
new file mode 100644
index 000000000..6b38e2e8a
--- /dev/null
+++ b/www/services/etl/test_biblionetwork.py
@@ -0,0 +1,17 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.biblionetwork import biblionetwork
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+net = biblionetwork(
+ df,
+ analysis="co-occurrences",
+ network="keywords"
+)
+
+print(type(net))
+print(net.shape)
+print(net.head())
diff --git a/www/services/etl/test_cocitation.py b/www/services/etl/test_cocitation.py
new file mode 100644
index 000000000..1b25e55bc
--- /dev/null
+++ b/www/services/etl/test_cocitation.py
@@ -0,0 +1,17 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.biblionetwork import biblionetwork
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+net = biblionetwork(
+ df,
+ analysis="co-citation",
+ network="references"
+)
+
+print(type(net))
+print(net.shape)
+print(net.head())
\ No newline at end of file
diff --git a/www/services/etl/test_collaboration.py b/www/services/etl/test_collaboration.py
new file mode 100644
index 000000000..51bf18c42
--- /dev/null
+++ b/www/services/etl/test_collaboration.py
@@ -0,0 +1,17 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.biblionetwork import biblionetwork
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+net = biblionetwork(
+ df,
+ analysis="collaboration",
+ network="authors"
+)
+
+print(type(net))
+print(net.shape)
+print(net.head())
\ No newline at end of file
diff --git a/www/services/etl/test_coupling_network.py b/www/services/etl/test_coupling_network.py
new file mode 100644
index 000000000..7830db40e
--- /dev/null
+++ b/www/services/etl/test_coupling_network.py
@@ -0,0 +1,17 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.biblionetwork import biblionetwork
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+net = biblionetwork(
+ df,
+ analysis="coupling",
+ network="authors"
+)
+
+print(type(net))
+print(net.shape)
+print(net.head())
\ No newline at end of file
diff --git a/www/services/etl/test_couplingmap.py b/www/services/etl/test_couplingmap.py
new file mode 100644
index 000000000..8e1eeb15d
--- /dev/null
+++ b/www/services/etl/test_couplingmap.py
@@ -0,0 +1,17 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.couplingmap import couplingMap
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+result = couplingMap(
+ df,
+ analysis="documents",
+ field="CR"
+)
+
+print(result["clusters"].head())
+print(result["data"].head())
+print(result["nclust"])
\ No newline at end of file
diff --git a/www/services/etl/test_histnetwork.py b/www/services/etl/test_histnetwork.py
new file mode 100644
index 000000000..384308b5f
--- /dev/null
+++ b/www/services/etl/test_histnetwork.py
@@ -0,0 +1,13 @@
+# test_histnetwork.py
+
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.histnetwork import histNetwork
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+result = histNetwork(df)
+
+print(type(result))
+print(result.keys())
\ No newline at end of file
diff --git a/www/services/etl/test_pipeline.py b/www/services/etl/test_pipeline.py
new file mode 100644
index 000000000..6846f4cdd
--- /dev/null
+++ b/www/services/etl/test_pipeline.py
@@ -0,0 +1,14 @@
+from www.services.etl.pipeline import openalex_pipeline
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=5
+)
+
+print(df.columns.tolist())
+
+print("\nID column:")
+print(df["ID"].head())
+
+print("\nDE column:")
+print(df["DE"].head())
\ No newline at end of file
diff --git a/www/services/etl/test_thematicmap.py b/www/services/etl/test_thematicmap.py
new file mode 100644
index 000000000..3648d6b93
--- /dev/null
+++ b/www/services/etl/test_thematicmap.py
@@ -0,0 +1,24 @@
+from www.services.etl.pipeline import openalex_pipeline
+from www.services.thematicmap import thematic_map
+
+df = openalex_pipeline(
+ query="machine learning",
+ max_results=50
+)
+
+result = thematic_map(
+ df,
+ field="ID"
+)
+
+print(type(result))
+print(len(result))
+
+for i, item in enumerate(result):
+ print(f"\n===== RESULT[{i}] =====")
+ print(type(item))
+
+ try:
+ print(item.head())
+ except:
+ print(item)
\ No newline at end of file
diff --git a/www/services/etl/transformer.py b/www/services/etl/transformer.py
new file mode 100644
index 000000000..461e183bb
--- /dev/null
+++ b/www/services/etl/transformer.py
@@ -0,0 +1,87 @@
+import pandas as pd
+from .dispatcher import get_mapping
+from .schema import (
+ MULTI_VALUE_COLUMNS,
+ REQUIRED_COLUMNS
+)
+
+def ensure_required_columns(df):
+
+ for col in REQUIRED_COLUMNS:
+ if col not in df.columns:
+ df[col] = ""
+
+ return df
+
+def split_multi_value(value):
+
+ if isinstance(value, list):
+ return value
+
+ if value is None:
+ return []
+
+ try:
+ if pd.isna(value):
+ return []
+ except Exception:
+ pass
+
+ if str(value).strip() == "":
+ return []
+
+ return [
+ item.strip()
+ for item in str(value).replace(",", ";").split(";")
+ if item.strip()
+ ]
+
+
+def standardize_nulls(df):
+
+ return df.fillna("")
+
+
+def rename_columns(df, source):
+
+ mapping = get_mapping(source)
+
+ existing_mapping = {
+ k: v
+ for k, v in mapping.items()
+ if k in df.columns
+ }
+
+ return df.rename(columns=existing_mapping)
+
+
+def standardize_multivalue_columns(df):
+
+ for col in MULTI_VALUE_COLUMNS:
+
+ if col not in df.columns:
+ continue
+
+ df[col] = df[col].apply(split_multi_value)
+
+ return df
+
+def ensure_required_columns(df):
+
+ for col in REQUIRED_COLUMNS:
+ if col not in df.columns:
+ df[col] = ""
+
+ return df
+
+def transform_dataframe(df, source):
+
+ df = rename_columns(df, source)
+
+ df = standardize_nulls(df)
+
+ df = ensure_required_columns(df)
+
+ df = standardize_multivalue_columns(df)
+
+ return df
\ No newline at end of file
diff --git a/www/services/etl/validator.py b/www/services/etl/validator.py
new file mode 100644
index 000000000..969e71cd7
--- /dev/null
+++ b/www/services/etl/validator.py
@@ -0,0 +1,68 @@
+from .schema import (
+ MANDATORY_COLUMNS,
+ MULTI_VALUE_COLUMNS
+)
+
+
+def validate_columns(df):
+ """
+ Ensure all required columns exist.
+ """
+
+ missing = [
+ col
+ for col in MANDATORY_COLUMNS
+ if col not in df.columns
+ ]
+
+ if missing:
+ raise ValueError(
+ f"Missing mandatory columns: {missing}"
+ )
+
+ return True
+
+
+def validate_nulls(df):
+ """
+ Ensure no null values remain.
+ """
+
+ if df.isnull().values.any():
+ raise ValueError(
+ "Dataset contains null values."
+ )
+
+ return True
+
+
+def validate_multivalue_types(df):
+ """
+ Ensure multivalue columns contain lists.
+ """
+
+ for col in MULTI_VALUE_COLUMNS:
+
+ if col not in df.columns:
+ continue
+
+ for value in df[col]:
+
+ if not isinstance(value, list):
+ raise TypeError(
+ f"{col} contains non-list value."
+ )
+
+ return True
+
+
+def validate_dataframe(df):
+ """
+ Full validation pipeline.
+ """
+
+ validate_columns(df)
+ validate_nulls(df)
+ validate_multivalue_types(df)
+
+ return True
\ No newline at end of file
diff --git a/www/services/format_functions.py b/www/services/format_functions.py
index 1a8ee7af4..9cac97870 100644
--- a/www/services/format_functions.py
+++ b/www/services/format_functions.py
@@ -3,7 +3,9 @@
import zipfile
import tempfile
import os
-
+from www.services.etl.transformer import transform_dataframe
+from www.services.etl.validator import validate_dataframe
+import pandas as pd
def format_ab_column(entry, source, file_type): # Function for AB Column (format--> "Abstract")
abstract = ''
@@ -1632,11 +1634,21 @@ def process_single_file(data, source, file_type, author):
entry_data.pop('AF', None) # Remove 'AF' if it exists
elif author == "fullname":
entry_data.pop('AU', None) # Remove 'AU' if it exists
+ entries.append(entry_data)
+
+ # =====================================================
+ # ETL STANDARDIZATION + VALIDATION PIPELINE
+ # =====================================================
- entries.append(entry_data)
+ df = pd.DataFrame(entries)
- return entries
+ try:
+ df = transform_dataframe(df, source.upper())
+ validate_dataframe(df)
+ except Exception as e:
+ print(f"ETL Validation Warning: {e}")
+ return df.to_dict(orient="records")
def biblio_json(data, source, type, author):
"""
diff --git a/www/services/histnetwork.py b/www/services/histnetwork.py
index 7848d9744..02cdf97bf 100644
--- a/www/services/histnetwork.py
+++ b/www/services/histnetwork.py
@@ -3,24 +3,9 @@
def histNetwork(df, min_citations=0, sep=";", network=True):
- """
- Create a historical network of citations from a DataFrame containing metadata of scientific papers.
-
- Args:
- df (DataFrame): A DataFrame containing metadata of scientific papers.
- min_citations (int): Minimum number of citations to include a paper in the analysis.
- sep (str): Separator used to separate references in the citation network.
- network (bool): If True, a citation network is created.
-
- Returns:
- A dictionary containing the following keys:
- - NetMatrix: A DataFrame containing the citation network.
- - histData: A DataFrame containing the metadata of the papers.
- - M: A DataFrame containing the metadata of the papers with the Local Citation Score (LCS).
- - LCS: A list containing the Local Citation Score of each paper.
- """
- M = df.get()
- db = M['DB'][0]
+
+ M = df
+ db = M['DB'].iloc[0]
# Ensure required fields are present
if 'DI' not in M:
@@ -35,9 +20,29 @@ def histNetwork(df, min_citations=0, sep=";", network=True):
M['TC'] = M['TC'].fillna(0)
if db == "Web_of_Science":
- results = wos(M, min_citations=min_citations, sep=sep, network=network)
+ results = wos(
+ M,
+ min_citations=min_citations,
+ sep=sep,
+ network=network
+ )
+
elif db == "Scopus":
- results = scopus(M, min_citations=min_citations, sep=sep, network=network)
+ results = scopus(
+ M,
+ min_citations=min_citations,
+ sep=sep,
+ network=network
+ )
+
+ elif db == "OPENALEX":
+ results = wos(
+ M,
+ min_citations=min_citations,
+ sep=sep,
+ network=network
+ )
+
else:
print("\nDatabase not compatible with direct citation analysis\n")
return None
@@ -76,36 +81,98 @@ def wos(M, min_citations, sep, network):
print(f"\nAnalyzing {len(CR)} reference items...\n")
CR_df = pd.DataFrame(CR)
+ if M["DB"].iloc[0] == "OPENALEX":
+
+ M["UT_CLEAN"] = (
+ M["UT"]
+ .fillna("")
+ .astype(str)
+ .str.upper()
+ .str.replace(".", "", regex=False)
+ )
+
+ CR_df["UT_CLEAN"] = (
+ CR_df["ref"]
+ .fillna("")
+ .astype(str)
+ .str.upper()
+ .str.replace(".", "", regex=False)
+ )
+
+ L = pd.merge(
+ M,
+ CR_df,
+ left_on="UT_CLEAN",
+ right_on="UT_CLEAN",
+ how="left"
+ )
+
+
+ L = L[L["Paper_y"].notnull()]
+
+
# Add LABEL field to M and CR
- M['LABEL'] = M['SR_FULL'].fillna('').str.upper() + " DOI " + M['DI'].fillna('').str.upper()
+ label_col = 'SR_FULL' if 'SR_FULL' in M.columns else 'SR'
+
+ M['LABEL'] = (
+ M[label_col].fillna('').astype(str).str.upper()
+ + " DOI "
+ + M['DI'].fillna('').astype(str).str.upper())
+
M['LABEL'] = M['LABEL'].str.strip()
CR_df['LABEL'] = CR_df['SR'].fillna('').str.upper() + " DOI " + CR_df['DI'].fillna('').str.upper()
CR_df['LABEL'] = CR_df['LABEL'].str.strip()
+
+
# Match references with papers (left join as in R)
- L = pd.merge(M, CR_df, on='LABEL', how='left', suffixes=('_M', '_CR'))
- L = L[L['Paper_CR'].notnull()]
- L['CITING'] = M.loc[L['Paper_CR'], 'LABEL'].values
- L['nCITING'] = M.loc[L['Paper_CR'], 'nLABEL'].values
- L['CIT_PY'] = M.loc[L['Paper_CR'], 'PY'].values
+ L = pd.merge(
+ M,
+ CR_df,
+ left_on="UT_CLEAN",
+ right_on="UT_CLEAN",
+ how="left"
+)
+
+
+ L = L[L["Paper_y"].notnull()]
+ L["CITING"] = M.loc[L["Paper_x"].astype(int), "UT"].values
+ L["CITED"] = M.loc[L["Paper_y"].astype(int), "UT"].values
+ L["CIT_PY"] = M.loc[L["Paper_x"].astype(int), "PY"].values
# Compute Local Citation Scores (LCS)
- LCS = L.groupby('nLABEL').size().reset_index(name='LCS')
- M['LCS'] = M['nLABEL'].map(LCS.set_index('nLABEL')['LCS']).fillna(0).astype(int)
+ LCS = L.groupby('Paper_y').size().reset_index(name='LCS')
- # Prepare histData
- histData = M[M['TC'] >= min_citations][['LABEL', 'TI', 'DE', 'ID', 'DI', 'PY', 'LCS', 'TC']]
- histData.columns = ['Paper', 'Title', 'Author_Keywords', 'KeywordsPlus', 'DOI', 'Year', 'LCS', 'GCS']
+ M['LCS'] = (
+ M.index.to_series()
+ .map(LCS.set_index('Paper_y')['LCS'])
+ .fillna(0)
+ .astype(int))
+ # Prepare histData
+ histData = M[M['TC'] >= min_citations][
+ ['UT', 'TI', 'DE', 'ID', 'DI', 'PY', 'LCS', 'TC']
+ ].copy()
+
+ histData.columns = [
+ 'Paper',
+ 'Title',
+ 'Author_Keywords',
+ 'KeywordsPlus',
+ 'DOI',
+ 'Year',
+ 'LCS',
+ 'GCS'
+]
WLCR = None
if network:
# Build citation network
- CITING = L.groupby('CITING').agg(
- LCR=('LABEL', lambda x: ';'.join(x.dropna())),
- PY=('CIT_PY', 'first'),
- Paper=('Paper_CR', 'first')
- ).reset_index().sort_values(by='PY')
+ CITING = L.groupby("CITING").agg(
+ LCR=("CITED", lambda x: ";".join(x.astype(str))),
+ PY=("CIT_PY", "first"),
+ Paper=("Paper_x", "first")
+ ).reset_index().sort_values(by="PY")
# Assign LCR to the correct Paper index (Paper is 0-based)
M['LCR'] = ""
@@ -127,10 +194,17 @@ def wos(M, min_citations, sep, network):
M.index = M['LABEL'].str.strip()
M['LCR'] = M['LCR'].fillna('')
-
+
# Ensure all papers are included as both rows and columns
- WLCR = cocMatrix(reactive.Value(M), Field="LCR", sep=sep)
+ WLCR = cocMatrix(M, Field="LCR", sep=sep)
+ if WLCR is None:
+ print("No Local Citation Relationships found.")
+ return {
+ "histData": pd.DataFrame(),
+ "M": M,
+ "LCS": []
+ }
# Trova le LABEL mancanti
missing_LABEL = set(M.index) - set(WLCR.columns)
diff --git a/www/services/metatagextraction.py b/www/services/metatagextraction.py
index 5e1f8b9c8..bdbeb2311 100644
--- a/www/services/metatagextraction.py
+++ b/www/services/metatagextraction.py
@@ -2,19 +2,8 @@
def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False):
- """
- Extract metadata tags from a DataFrame based on the specified field.
-
- Args:
- df: A DataFrame object containing the data.
- Field: The field to extract metadata tags from.
- sep: The separator used to split the metadata tags.
- aff_disamb: A boolean value indicating whether to disambiguate the affiliations.
-
- Returns:
- A DataFrame with the extracted metadata tags.
- """
- M = df.get()
+
+ M = df
if Field == "SR":
M = SR(M)
@@ -41,34 +30,58 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False):
a = ind[ind > -1].index
M.loc[a, "AU1_UN"] = M.loc[a, "AU1_UN"].str[ind[a] + 2:]
- df.set(M)
+ df = M
return df
def SR(M):
listAU = M["AU"].apply(lambda l: [x.strip() for x in l])
+
if M["DB"].iloc[0].lower() == "scopus":
- listAU = listAU.apply(lambda l: [x.replace(" ", ",").replace(",,", ",").replace(" ", "") for x in l])
- FirstAuthors = listAU.apply(lambda l: l[0] if len(l) > 0 else "NA").str.replace(",", " ")
+ listAU = listAU.apply(
+ lambda l: [
+ x.replace(" ", ",")
+ .replace(",,", ",")
+ .replace(" ", "")
+ for x in l
+ ]
+ )
+
+ FirstAuthors = listAU.apply(
+ lambda l: l[0] if len(l) > 0 else "NA"
+ ).str.replace(",", " ")
+
+ # OpenAlex compatibility
+ if "JI" not in M.columns:
+ if "SO" in M.columns:
+ M["JI"] = M["SO"]
+ else:
+ M["JI"] = ""
no_art = M["JI"] == ""
M.loc[no_art, "JI"] = M.loc[no_art, "SO"]
+
J9 = M["JI"].str.replace(".", " ", regex=False).str.strip()
+
SR = FirstAuthors + ", " + M["PY"].astype(str) + ", " + J9
M["SR_FULL"] = SR.str.replace(r"\s+", " ", regex=True)
- st = i = 0
+ st = 0
+ i = 0
+
while st == 0:
ind = SR.duplicated()
+
if ind.any():
i += 1
SR[ind] = SR[ind] + "-" + chr(96 + i)
else:
st = 1
+
M["SR"] = SR.str.replace(r"\s+", " ", regex=True)
-
+
return M
diff --git a/www/services/thematicmap.py b/www/services/thematicmap.py
index 3c313b7f6..55194d952 100644
--- a/www/services/thematicmap.py
+++ b/www/services/thematicmap.py
@@ -7,13 +7,13 @@
def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False):
# df = metaTagExtraction(df, field=field)
M = df
- m = df.get()
+
# Set ngrams based on field
ngrams = int(ngrams) if field in ['TI', 'AB'] else 1
# Set stemming as boolean
stemming = True if stemming == "Yes" else False
- minfreq = max(0, int(minfreq * len(m) // 1000))
+ minfreq = max(0, int(minfreq * len(M) // 1000))
# Preprocess field and create network matrix
if field == "ID":
@@ -314,8 +314,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz
)
)
fig = go.FigureWidget(fig)
- fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],
- 'displaylogo': False}
+ fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],'displaylogo': False}
##############################################################################################################################################
@@ -335,7 +334,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz
df = df[['Cluster', 'CallonCentrality', 'CallonDensity', 'RankCentrality', 'RankDensity', 'ClusterFrequency']]
# Handle document clustering
- document_to_clusters = cluster_assignment(M=m, words=df_lab, field=field, remove_terms=remove_terms, synonyms=synonyms, threshold=0.5)
+ document_to_clusters = cluster_assignment(M=M, words=df_lab, field=field, remove_terms=remove_terms, synonyms=synonyms, threshold=0.5)
# Create parameters dictionary and unpack into dataframe
params = {
@@ -660,14 +659,16 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh
year = pd.Timestamp.now().year + 1
M = M.reset_index(drop=True)
- terms = (M.assign(
+ terms = (
+ M.assign(
TCpY=lambda x: x['TC']/(year-x['PY']),
NTC=lambda x: x.groupby('PY')['TC'].transform(lambda y: y/y.mean())
- )[['DI', 'AU', 'TI', 'SO', 'PY', 'TC', 'TCpY', 'NTC', 'SR']]
- .merge(terms, on='SR')
- .fillna(0)
- .groupby('Assigned_cluster')
- .apply(lambda x: x.sort_values('TC', ascending=False))
- .reset_index(drop=True))
-
+ )[['DI','AU','TI','SO','PY','TC','TCpY','NTC','SR']]
+ .merge(terms, on='SR')
+ .fillna(0)
+ .infer_objects(copy=False)
+ .groupby('Assigned_cluster')
+ .apply(lambda x: x.sort_values('TC', ascending=False))
+ .reset_index(drop=True))
+
return terms