diff --git a/modules/snowflake-analytics/images/kmeans.png b/modules/snowflake-analytics/images/kmeans.png new file mode 100644 index 00000000..8b3cdf78 Binary files /dev/null and b/modules/snowflake-analytics/images/kmeans.png differ diff --git a/modules/snowflake-analytics/nav.adoc b/modules/snowflake-analytics/nav.adoc index 6672af66..122c8f8f 100644 --- a/modules/snowflake-analytics/nav.adoc +++ b/modules/snowflake-analytics/nav.adoc @@ -1,5 +1,6 @@ ** xref:index.adoc[Graph Analytics for Snowflake] *** xref:neo4j-retail-recs.adoc[Create Better Recommendations with Node Similarity] +*** xref:neo4j-marketing-segmentation.adoc[Segment Based on Behavior with Louvain] *** xref:neo4j-fraud.adoc[Discover Fraudulent Communities] *** xref:neo4j-insurance-fraud.adoc[Discover Fraudulent Insurance Claims with Embeddings] *** xref:neo4j-manufacturing.adoc[Manage Risk with a Digital Twin] diff --git a/modules/snowflake-analytics/pages/neo4j-marketing-segmentation.adoc b/modules/snowflake-analytics/pages/neo4j-marketing-segmentation.adoc new file mode 100644 index 00000000..44996d03 --- /dev/null +++ b/modules/snowflake-analytics/pages/neo4j-marketing-segmentation.adoc @@ -0,0 +1,354 @@ + + += Segment Based on Customer Behavior - Not Characteristics - with Louvain + +Every meaningful marketing decision starts with segmentation—who to +target, what to recommend, and how to personalize an experience. The +quality of those decisions depends entirely on how customers are +grouped. + +There are two fundamentally different ways to think about customer +segmentation. We can group customers by who they are, or by what they +do. + +== Overview + +=== What Is Neo4j Graph Analytics For Snowflake? + +Neo4j helps organizations find hidden relationships and patterns across +billions of data connections deeply, easily, and quickly. *Neo4j Graph +Analytics for Snowflake* brings to the power of graph directly to +Snowflake, allowing users to run 65{plus} ready-to-use algorithms on +their data, all without leaving Snowflake! + +=== Prerequisites + +* The Native App +https://app.snowflake.com/marketplace/listing/GZTDZH40CN[Neo4j Graph +Analytics] for Snowflake + +=== What You Will Need + +* A https://signup.snowflake.com/?utm_cta=quickstarts[Snowflake account] +with appropriate access to databases and schemas. +* Neo4j Graph Analytics application installed from the Snowflake +marketplace. Access the marketplace via the menu bar on the left hand +side of your screen, as seen below: +image:marketplace.png[image] + +=== What You Will Build + +* A method to identify communities that are at high risk of fraud in P2P +networks + +=== What You Will Learn + +* How to prepare and project your data for graph analytics +* How to read and write directly from and to your snowflake tables + +== Loading The Data + +Dataset overview : This dataset is a subset of instakart data and can be +found +https://github.com/neo4j-product-examples/aura-graph-analytics/tree/main/better_recommendations/data[here]. + +Let’s name our database `RETAIL++_++RECS`. We are going to add five new +tables: + +* One called `aisles` based on the aisles.csv +* One called `baskets` based on baskets.csv +* One called `departments` based on departments.csv +* One called `order++_++history` based on order++_++history.csv +* One called `products` based on products.csv + +Follow the steps found +https://docs.snowflake.com/en/user-guide/data-load-web-ui[here] to load +in your data. + +Then switch to that database: + +.... +USE DATABASE RETAIL_RECS; +USE SCHEMA PUBLIC; +.... + +== Segmentation by Characteristics + +In a traditional approach like K-means, each customer is first reduced +to a small set of characteristics. These characteristics are fixed +numeric attributes — things like average basket size or reorder tendency +— that at best, these approaches cluster customers by summarized +behavior and at worst, by characteristics that entirely miss how they +are actually behaving. + +Once every customer is represented as a point in feature space, K-means +groups together customers who look numerically similar. + +Let’s first prep our data for this approach: + +[source,sql] +---- +CREATE OR REPLACE TABLE ORDER_ITEM_COUNTS AS +SELECT + order_id, + COUNT(*) AS items_in_order +FROM BASKETS +GROUP BY 1; +---- + +We are going to create two features: reorder rate and average items per +order. These features do get at a given customer’s behavior, +specifically how many things they buy each time and how often they +reorder a item. + +[source,sql] +---- +CREATE OR REPLACE TABLE USER_KMEANS_FEATURES AS +WITH ORDER_STATS AS ( + SELECT + order_id, + COUNT(*) AS items_in_order, + AVG(reordered::FLOAT) AS reorder_rate_in_order + FROM BASKETS + GROUP BY 1 +) +SELECT + o.user_id, + AVG(s.items_in_order::FLOAT) AS avg_items_per_order, + AVG(s.reorder_rate_in_order) AS reorder_rate +FROM ORDERS o +JOIN ORDER_STATS s + ON o.order_id = s.order_id +GROUP BY 1; +---- + +[source,sql] +---- +select * from user_kmeans_features +---- + +[cols=",,",options="header",] +|=== +|USER++_++ID |AVG++_++ITEMS++_++PER++_++ORDER |REORDER++_++RATE +|202279 |9 |0.6666666666666666 +|205970 |8 |1 +|178520 |13 |0.9230769230769231 +|156122 |26 |0.8076923076923077 +|142903 |2 |0 +|=== + +Now that we have our features, we will take them out of SQL and load +them into python where we will perform our analysis. With Graph +Analytics for Snowflake, you can avoid this data movement and run your +algorithm directly on SQL tables. + +[source,python] +---- +from snowflake.snowpark.context import get_active_session + +session = get_active_session() + +df = session.table("USER_KMEANS_FEATURES").select( + "USER_ID", "AVG_ITEMS_PER_ORDER", "REORDER_RATE" +).dropna() + +pdf = df.to_pandas() +pdf.head() +---- + +[cols=",,",options="header",] +|=== +|USER++_++ID |AVG++_++ITEMS++_++PER++_++ORDER |REORDER++_++RATE +|202279 |9 |0.6666666666666666 +|205970 |8 |1 +|178520 |13 |0.9230769230769231 +|156122 |26 |0.8076923076923077 +|142903 |2 |0 +|=== + +[source,python] +---- +import os +os.environ["LOKY_MAX_CPU_COUNT"] = "1" +os.environ["JOBLIB_MULTIPROCESSING"] = "0" +---- + +[source,python] +---- +from sklearn.preprocessing import MinMaxScaler +from sklearn.cluster import KMeans + +X = pdf[["AVG_ITEMS_PER_ORDER", "REORDER_RATE"]].to_numpy() + +X_scaled = MinMaxScaler().fit_transform(X) + +km = KMeans(n_clusters=3, n_init=10, random_state=42, algorithm="lloyd") +pdf["CLUSTER"] = km.fit_predict(X_scaled) +---- + +Let’s take a look at our final result. We sorted our customers into +three distinct groups based on how much they buy when they go to the +store and how many of their items are reorders. + +But these are summaries of customer’s behavior, and if we pause and +consider what exactly we are modeling, we will be able to poke some +holes. Does a teacher reordering pencils form a meaningful group with a +new father reordering baby formula? From a marketing standpoint, likely +not. + +When customer behavior is flattened into these summaries, we necessarily +lose information. That is the fundamental challenge in modeling based on +characteristics rather than the behavior itself. + +[source,python] +---- +import matplotlib.pyplot as plt + +plt.figure() +plt.scatter( + pdf["AVG_ITEMS_PER_ORDER"], + pdf["REORDER_RATE"], + c=pdf["CLUSTER"] +) +plt.xlabel("avg_items_per_order") +plt.ylabel("reorder_rate") +plt.title("K-means customer clusters") +plt.show() +---- +image:kmeans.png[image] + +== Model Based on Behavior with Louvain + +With a graph-based approach, we don’t summarize behavior into +attributes. Instead, we model behavior directly. + +Customer segmentation in graph analytics mirrors how students choose +tables in a cafeteria. Students who interact frequently sit together; +likewise, customers who interact with the same items naturally form +groups. Thats how graph algorithms, like louvain, segment customers +based on directly on their behavior + +In our example, customers are connected to the products they purchase, +forming a network of interactions. + +Louvain finds communities in this network — groups of customers who are +tightly connected through shared purchasing behavior. + +We will create three tables. Two are node tables for the customers and +the products. The third is an edge list showing customer purchase +events. + +[source,sql] +---- +CREATE OR REPLACE VIEW CUSTOMERS AS +SELECT DISTINCT + user_id AS nodeId +FROM ORDERS; + +CREATE OR REPLACE VIEW PRODUCTS_NODES AS +SELECT + product_id AS nodeId +FROM PRODUCTS; + +CREATE OR REPLACE VIEW PURCHASES AS +SELECT + o.user_id AS sourceNodeId, + b.product_id AS targetNodeId +FROM ORDERS o +JOIN BASKETS b + ON o.order_id = b.order_id; +---- + +== Granting Permissions + +Next we grant the necessary permissions: + +[source,sql] +---- +USE ROLE ACCOUNTADMIN; +---- + +[source,sql] +---- +-- Create a consumer role for users and admins of the GDS application +CREATE ROLE IF NOT EXISTS gds_user_role; +CREATE ROLE IF NOT EXISTS gds_admin_role; +GRANT APPLICATION ROLE neo4j_graph_analytics.app_user TO ROLE gds_user_role; +GRANT APPLICATION ROLE neo4j_graph_analytics.app_admin TO ROLE gds_admin_role; + +CREATE DATABASE ROLE IF NOT EXISTS gds_db_role; +GRANT DATABASE ROLE gds_db_role TO ROLE gds_user_role; +GRANT DATABASE ROLE gds_db_role TO APPLICATION neo4j_graph_analytics; + +-- Grant access to consumer data +GRANT USAGE ON DATABASE RETAIL_RECS TO ROLE gds_user_role; +GRANT USAGE ON SCHEMA RETAIL_RECS.PUBLIC TO ROLE gds_user_role; + +-- Required to read tabular data into a graph +GRANT SELECT ON ALL TABLES IN DATABASE RETAIL_RECS TO DATABASE ROLE gds_db_role; + +-- Ensure the consumer role has access to created tables/views +GRANT ALL PRIVILEGES ON FUTURE TABLES IN SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; +GRANT CREATE TABLE ON SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; +GRANT CREATE VIEW ON SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; +GRANT ALL PRIVILEGES ON FUTURE VIEWS IN SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; +GRANT ALL PRIVILEGES ON ALL VIEWS IN SCHEMA RETAIL_RECS.PUBLIC TO DATABASE ROLE gds_db_role; + +-- Compute and warehouse access +GRANT USAGE ON WAREHOUSE NEO4J_GRAPH_ANALYTICS_APP_WAREHOUSE TO APPLICATION neo4j_graph_analytics; +---- + +[source,sql] +---- +use role gds_role; +---- + +== Running our Algorithms + +We apply Louvain to a customer–item purchase graph to identify +communities of customers with similar buying behavior. These communities +emerge from shared purchase patterns, rather than predefined customer +attributes. + +[source,sql] +---- +CALL Neo4j_Graph_Analytics.graph.louvain('CPU_X64_XS', { + 'defaultTablePrefix': 'retail_recs.public', + 'project': { + 'nodeTables': [ 'CUSTOMERS', 'PRODUCTS_NODES'], + 'relationshipTables': { + 'PURCHASES': { + 'sourceTable': 'CUSTOMERS', + 'targetTable': 'PRODUCTS_NODES', + 'orientation': 'UNDIRECTED' + } + } + }, + 'compute': { + 'mutateProperty': 'community_id' + }, + 'write': [{ + 'nodeLabel': 'CUSTOMERS', + 'outputTable': 'CUSTOMERS_GROUP', + 'nodeProperty': 'community_id' + }] +}); +---- + +And now we have a different customer segments derived directly from +those customer’s behavior rather than the customer’s characteristics: + +[source,sql] +---- +select community_id, count(*) as node_count from customers_group group by community_id +---- + +[cols=",",options="header",] +|=== +|COMMUNITY++_++ID |NODE++_++COUNT +|4970 |91 +|4952 |43 +|5277 |47 +|5380 |30 +|=== diff --git a/modules/snowflake-analytics/pages/neo4j-retail-recs.adoc b/modules/snowflake-analytics/pages/neo4j-retail-recs.adoc index c5cf0fc2..55d2defe 100644 --- a/modules/snowflake-analytics/pages/neo4j-retail-recs.adoc +++ b/modules/snowflake-analytics/pages/neo4j-retail-recs.adoc @@ -47,7 +47,7 @@ with appropriate access to databases and schemas. * Neo4j Graph Analytics application installed from the Snowflake marketplace. Access the marketplace via the menu bar on the left hand side of your screen, as seen below: -image:/Users/corydonbaylor/Documents/md2adoc/md/assets/marketplace.png[image] +image:marketplace.png[image] === What You Will Build