Amazon

2,963 Amazonian reviews are covered in this post.

This series would not be complete if we analyzed Google (link) and Microsoft (link and link) but not Amazon. The scraping technique is the same as before; only the URLs differ. I collected 2,963 reviews posted between 2008 and 2017. Please note that I scraped reviews from Washington State only.

##### Load Goodies #####
library(tidyverse)
library(stringr)
library(tidytext)
library(ggraph)
library(igraph)

##### Create Theme for GGPLOT2 #####
# Partial ggplot2 theme used by the charts in this post: cream-coloured
# plot/panel/legend backgrounds, a dashed black panel border and dotted grey
# grid lines. Add it to a plot with `+ theme_moma()`.
# NOTE: base_size and base_family are accepted but are not used by the body.
theme_moma <- function(base_size = 12, base_family = "Helvetica") {
  cream_fill <- element_rect(fill = "#F7F6ED")
  dotted_grid <- element_line(colour = "#7F7F7F", linetype = "dotted")
  dashed_border <- element_rect(
    colour = "black",
    fill = NA,
    linetype = "dashed"
  )

  theme(
    plot.background = cream_fill,
    legend.key = cream_fill,
    legend.background = cream_fill,
    panel.background = cream_fill,
    panel.border = dashed_border,
    panel.grid.minor = dotted_grid,
    panel.grid.major = dotted_grid
  )
}

##### Load Goodies #####

library(tidyverse)

library(stringr)

library(tidytext)

library(ggraph)

library(igraph)

##### Create Theme for GGPLOT2 #####

# Partial ggplot2 theme used by the charts in this post: cream-coloured
# plot/panel/legend backgrounds, a dashed black panel border and dotted grey
# grid lines. Add it to a plot with `+ theme_moma()`.
# NOTE: base_size and base_family are accepted but are not used by the body.
theme_moma <- function(base_size = 12, base_family = "Helvetica") {
  cream_fill <- element_rect(fill = "#F7F6ED")
  dotted_grid <- element_line(colour = "#7F7F7F", linetype = "dotted")
  dashed_border <- element_rect(
    colour = "black",
    fill = NA,
    linetype = "dashed"
  )

  theme(
    plot.background = cream_fill,
    legend.key = cream_fill,
    legend.background = cream_fill,
    panel.background = cream_fill,
    panel.border = dashed_border,
    panel.grid.minor = dotted_grid,
    panel.grid.major = dotted_grid
  )
}

Three data-processing steps are necessary.

##### Processing #####
# Keep columns 2 to 6 of the scraped data and drop the rows whose Summary is
# the literal "xx".
data2 <- data[2:6] %>%
  filter(Summary != "xx")

##### Date Management #####
# The format string starts with a space, so the raw Posted_Date strings
# presumably carry a leading space (e.g. " Nov 8, 2016") -- confirm against
# the scraped data. %b matches month abbreviations in the current locale.
data2$Posted_Date <- as.Date(data2$Posted_Date, format = " %b %d, %Y")

##### Munging with the Words #####
# Separating Employee Type and Locations
# The raw Title is split twice: first on " in " (employee part / location),
# then on " - " (employee type / job title).
# Splitting on " - " instead of "ee - " keeps "Employee" intact, so no
# follow-up "Employ" -> "Employee" repair is needed (that repair produced
# "Employeeee" whenever the text contained no "ee - "), and employee types
# that do not end in "ee" are split as well.
# extra = "merge" keeps any further " - " inside the job title;
# fill = "right" yields NA for Title when there is no job title at all.
data2 <- data2 %>%
  separate(col = Title, into = c("Employee_Type", "Location"), sep = " in ") %>%
  separate(
    col = Employee_Type,
    into = c("Employee_Type", "Title"),
    sep = " - ",
    extra = "merge",
    fill = "right"
  )

##### Processing #####
# Keep columns 2 to 6 of the scraped data and drop the rows whose Summary is
# the literal "xx".
data2 <- data[2:6] %>%
  filter(Summary != "xx")

##### Date Management #####
# The format string starts with a space, so the raw Posted_Date strings
# presumably carry a leading space (e.g. " Nov 8, 2016") -- confirm against
# the scraped data. %b matches month abbreviations in the current locale.
data2$Posted_Date <- as.Date(data2$Posted_Date, format = " %b %d, %Y")

##### Munging with the Words #####
# Separating Employee Type and Locations
# The raw Title is split twice: first on " in " (employee part / location),
# then on " - " (employee type / job title).
# Splitting on " - " instead of "ee - " keeps "Employee" intact, so no
# follow-up "Employ" -> "Employee" repair is needed (that repair produced
# "Employeeee" whenever the text contained no "ee - "), and employee types
# that do not end in "ee" are split as well.
# extra = "merge" keeps any further " - " inside the job title;
# fill = "right" yields NA for Title when there is no job title at all.
data2 <- data2 %>%
  separate(col = Title, into = c("Employee_Type", "Location"), sep = " in ") %>%
  separate(
    col = Employee_Type,
    into = c("Employee_Type", "Title"),
    sep = " - ",
    extra = "merge",
    fill = "right"
  )

Before creating bigrams, let’s check the titles of the Amazonians.

##### Title #####
# Number of reviews per job Title, most frequent titles first.
data2 %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

##### Title #####
# Number of reviews per job Title, most frequent titles first.
data2 %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

> data2 %>%
+   group_by(Title) %>%
+   summarise(count = n()) %>%
+   arrange(desc(count))
# A tibble: 588 × 2
                              Title count
                              <chr> <int>
1                Anonymous Employee   414
2     Software Development Engineer   238
3  Software Development Engineer II   166
4   Software Development Engineer I   111
5                 Software Engineer    88
6            Senior Product Manager    78
7      Software Development Manager    60
8                   Product Manager    53
9                    Senior Manager    52
10                  Program Manager    49
# ... with 578 more rows

> data2 %>%

+ group_by(Title) %>%

+ summarise(count = n()) %>%

+ arrange(desc(count))

# A tibble: 588 × 2

Title count

1 Anonymous Employee 414

2 Software Development Engineer 238

3 Software Development Engineer II 166

4 Software Development Engineer I 111

5 Software Engineer 88

6 Senior Product Manager 78

7 Software Development Manager 60

8 Product Manager 53

9 Senior Manager 52

10 Program Manager 49

# ... with 578 more rows

That’s better than Google; anonymous reviews make up only 14.0% (414 of 2,963). Low anonymity is good because we can drill down to see whether there is any pattern within each title. But before that, we need to process the text.

##### Clean the review text #####
# Columns holding the working copies of the review text.
# NOTE(review): Summary_2, Pros_2 and Cons_2 are not created in the code shown
# in this post; they are assumed to already exist in data2 -- confirm.
text_cols <- c("Summary_2", "Pros_2", "Cons_2")

##### Work Life Balance & Other Words #####
# One pattern for every spelling handled before ("work life balance",
# "work-life balance", "work/life balance", "work life", "work&life",
# "worklife"): "work", an optional single separator (space, "/", "&" or "-"),
# "life", and an optional trailing " balance".
wlb_pattern <- "work[ /&-]?life( balance)?"

for (col in text_cols) {
  ##### Change to lower #####
  cleaned <- str_to_lower(data2[[col]])

  # Collapse the work-life variants BEFORE stripping punctuation. Doing it
  # afterwards meant the "-", "/" and "&" spellings could never match, and
  # "work-life balance" ended up as "wlb balance" instead of "wlb".
  cleaned <- str_replace_all(cleaned, wlb_pattern, "wlb")

  ##### Remove Punct #####
  cleaned <- str_replace_all(cleaned, "[:punct:]", "")

  ##### Remove Numbers #####
  cleaned <- str_replace_all(cleaned, "[:digit:]", "")

  data2[[col]] <- cleaned
}

##### Clean the review text #####
# Columns holding the working copies of the review text.
# NOTE(review): Summary_2, Pros_2 and Cons_2 are not created in the code shown
# in this post; they are assumed to already exist in data2 -- confirm.
text_cols <- c("Summary_2", "Pros_2", "Cons_2")

##### Work Life Balance & Other Words #####
# One pattern for every spelling handled before ("work life balance",
# "work-life balance", "work/life balance", "work life", "work&life",
# "worklife"): "work", an optional single separator (space, "/", "&" or "-"),
# "life", and an optional trailing " balance".
wlb_pattern <- "work[ /&-]?life( balance)?"

for (col in text_cols) {
  ##### Change to lower #####
  cleaned <- str_to_lower(data2[[col]])

  # Collapse the work-life variants BEFORE stripping punctuation. Doing it
  # afterwards meant the "-", "/" and "&" spellings could never match, and
  # "work-life balance" ended up as "wlb balance" instead of "wlb".
  cleaned <- str_replace_all(cleaned, wlb_pattern, "wlb")

  ##### Remove Punct #####
  cleaned <- str_replace_all(cleaned, "[:punct:]", "")

  ##### Remove Numbers #####
  cleaned <- str_replace_all(cleaned, "[:digit:]", "")

  data2[[col]] <- cleaned
}

And… let’s create the chart for Pros.

#### Overall - Pros #####
# Builds a directed word network from the two-word phrases (bigrams) found in
# Pros_2 and draws it with ggraph.

# Step 1: Unnest -- one row per bigram.
data_pros <- data2 %>%
  select(Pros_2) %>%
  unnest_tokens(words, Pros_2, token = "ngrams", n = 2)

# Step 2: Separate each bigram into its first ("from") and second ("to") word.
# `sep` is an argument of separate(), not an element of `into`. Placing it
# inside c() created a third, junk column (plus "missing pieces" warnings)
# that then had to be dropped with select(1:2).
data_pros_split <- data_pros %>%
  separate(words, into = c("from", "to"), sep = " ")

# Step 3: Remove stopwords. Rows without a complete bigram (NA) are dropped
# too; otherwise they pass the stop-word test and can surface as an "NA" node.
data_pros_clean <- data_pros_split %>%
  filter(!is.na(from), !is.na(to)) %>%
  filter(!from %in% stop_words$word) %>%
  filter(!to %in% stop_words$word)

# Step 4: Count how often each (from, to) pair occurs.
data_pros_counts <- data_pros_clean %>%
  count(from, to)

# Keep only pairs seen more than 10 times and turn them into a graph
# (words become nodes, bigrams become edges carrying the count n).
pros_bigram <- data_pros_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()

arrow_control <- grid::arrow(type = "closed", length = unit(.15, "inches"))
Pros_chart <- ggraph(pros_bigram) +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = arrow_control
  ) +
  geom_node_point(color = "lightgreen", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  theme(plot.background = element_rect(fill = "#F7F6ED")) +
  ggtitle("Pros")

Pros_chart

#### Overall - Pros #####
# Builds a directed word network from the two-word phrases (bigrams) found in
# Pros_2 and draws it with ggraph.

# Step 1: Unnest -- one row per bigram.
data_pros <- data2 %>%
  select(Pros_2) %>%
  unnest_tokens(words, Pros_2, token = "ngrams", n = 2)

# Step 2: Separate each bigram into its first ("from") and second ("to") word.
# `sep` is an argument of separate(), not an element of `into`. Placing it
# inside c() created a third, junk column (plus "missing pieces" warnings)
# that then had to be dropped with select(1:2).
data_pros_split <- data_pros %>%
  separate(words, into = c("from", "to"), sep = " ")

# Step 3: Remove stopwords. Rows without a complete bigram (NA) are dropped
# too; otherwise they pass the stop-word test and can surface as an "NA" node.
data_pros_clean <- data_pros_split %>%
  filter(!is.na(from), !is.na(to)) %>%
  filter(!from %in% stop_words$word) %>%
  filter(!to %in% stop_words$word)

# Step 4: Count how often each (from, to) pair occurs.
data_pros_counts <- data_pros_clean %>%
  count(from, to)

# Keep only pairs seen more than 10 times and turn them into a graph
# (words become nodes, bigrams become edges carrying the count n).
pros_bigram <- data_pros_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()

arrow_control <- grid::arrow(type = "closed", length = unit(.15, "inches"))
Pros_chart <- ggraph(pros_bigram) +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = arrow_control
  ) +
  geom_node_point(color = "lightgreen", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  theme(plot.background = element_rect(fill = "#F7F6ED")) +
  ggtitle("Pros")

Pros_chart

Most of the words are not surprising: smart people, flexible hours, stock options, wlb, etc. Amazonians seem to enjoy working in downtown Seattle. What about Cons?

#### Overall - Cons #####
# Builds a directed word network from the two-word phrases (bigrams) found in
# Cons_2 and draws it with ggraph.

# Step 1: Unnest -- one row per bigram.
data_cons <- data2 %>%
  select(Cons_2) %>%
  unnest_tokens(words, Cons_2, token = "ngrams", n = 2)

# Step 2: Separate each bigram into its first ("from") and second ("to") word.
# `sep` is an argument of separate(), not an element of `into`. Placing it
# inside c() created a third, junk column (plus "missing pieces" warnings)
# that then had to be dropped with select(1:2).
data_cons_split <- data_cons %>%
  separate(words, into = c("from", "to"), sep = " ")

# Step 3: Remove stopwords. Rows without a complete bigram (NA) are dropped
# too; otherwise they pass the stop-word test and can surface as an "NA" node.
data_cons_clean <- data_cons_split %>%
  filter(!is.na(from), !is.na(to)) %>%
  filter(!from %in% stop_words$word) %>%
  filter(!to %in% stop_words$word)

# Step 4: Count how often each (from, to) pair occurs.
data_cons_counts <- data_cons_clean %>%
  count(from, to)

# Keep only pairs seen more than 10 times and turn them into a graph
# (words become nodes, bigrams become edges carrying the count n).
cons_bigram <- data_cons_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()

arrow_control <- grid::arrow(type = "closed", length = unit(.15, "inches"))
cons_chart <- ggraph(cons_bigram) +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = arrow_control
  ) +
  geom_node_point(color = "lightgreen", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  theme(plot.background = element_rect(fill = "#F7F6ED")) +
  ggtitle("cons")

cons_chart

#### Overall - Cons #####
# Builds a directed word network from the two-word phrases (bigrams) found in
# Cons_2 and draws it with ggraph.

# Step 1: Unnest -- one row per bigram.
data_cons <- data2 %>%
  select(Cons_2) %>%
  unnest_tokens(words, Cons_2, token = "ngrams", n = 2)

# Step 2: Separate each bigram into its first ("from") and second ("to") word.
# `sep` is an argument of separate(), not an element of `into`. Placing it
# inside c() created a third, junk column (plus "missing pieces" warnings)
# that then had to be dropped with select(1:2).
data_cons_split <- data_cons %>%
  separate(words, into = c("from", "to"), sep = " ")

# Step 3: Remove stopwords. Rows without a complete bigram (NA) are dropped
# too; otherwise they pass the stop-word test and can surface as an "NA" node.
data_cons_clean <- data_cons_split %>%
  filter(!is.na(from), !is.na(to)) %>%
  filter(!from %in% stop_words$word) %>%
  filter(!to %in% stop_words$word)

# Step 4: Count how often each (from, to) pair occurs.
data_cons_counts <- data_cons_clean %>%
  count(from, to)

# Keep only pairs seen more than 10 times and turn them into a graph
# (words become nodes, bigrams become edges carrying the count n).
cons_bigram <- data_cons_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()

arrow_control <- grid::arrow(type = "closed", length = unit(.15, "inches"))
cons_chart <- ggraph(cons_bigram) +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = arrow_control
  ) +
  geom_node_point(color = "lightgreen", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  theme(plot.background = element_rect(fill = "#F7F6ED")) +
  ggtitle("cons")

cons_chart

Okay, this one is a lot more interesting. wlb shows up in Cons as well, along with stock grants, hiring bar, and, most interesting of all, ‘green card’.

Let’s take a look at them one by one. My theory is that wlb must vary significantly between teams/departments/groups. It is great that Amazonians shared their titles, so let’s drill down.

###### Count wlb in Pros by Title #####
# Keep the reviews whose Pros_2 text contains "wlb", then count them per
# Title, largest count first.
data2 %>%
  filter(str_detect(Pros_2, "wlb")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

###### Count wlb in Pros by Title #####
# Keep the reviews whose Pros_2 text contains "wlb", then count them per
# Title, largest count first.
data2 %>%
  filter(str_detect(Pros_2, "wlb")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

> data2 %>%
+   filter(str_detect(Pros_2,'wlb') == TRUE) %>%
+   group_by(Title) %>%
+   summarise(count = n())
# A tibble: 43 × 2
                             Title count
                             <chr> <int>
1                Account Executive     1
2               Anonymous Employee     3
3                Applied Scientist     1
4                 Business Analyst     1
5  Client Lead Technical Recruiter     1
6           Cloud Support Engineer     2
7        Cloud Support Engineer IV     1
8     Enterprise Account Executive     1
9            Facilities Technician     1
10                  Graphic Artist     1
# ... with 33 more rows

> data2 %>%

+ filter(str_detect(Pros_2,'wlb') == TRUE) %>%

+ group_by(Title) %>%

+ summarise(count = n())

# A tibble: 43 × 2

Title count

1 Account Executive 1

2 Anonymous Employee 3

3 Applied Scientist 1

4 Business Analyst 1

5 Client Lead Technical Recruiter 1

6 Cloud Support Engineer 2

7 Cloud Support Engineer IV 1

8 Enterprise Account Executive 1

9 Facilities Technician 1

10 Graphic Artist 1

# ... with 33 more rows

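wlb is mentioned in the Pros section by reviewers with 43 distinct titles (the tibble has one row per title, so 43 is the number of titles, not the number of mentions). Plotting without creating a new variable to group Title should not be too bad. Let’s try.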

###### Plot wlb in Pros by Title #####
# Bar chart of the number of reviews mentioning "wlb" in Pros_2, per Title.
# Sorting the rows with arrange() does not affect the plot: a character x
# axis is drawn in alphabetical order. reorder() sorts the bars by descending
# count instead; labs() keeps the axis label as "Title".
data2 %>%
  filter(str_detect(Pros_2, "wlb")) %>%
  group_by(Title) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = reorder(Title, -count), y = count)) +
  geom_col() +
  labs(x = "Title") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_moma()

###### Plot wlb in Pros by Title #####
# Bar chart of the number of reviews mentioning "wlb" in Pros_2, per Title.
# Sorting the rows with arrange() does not affect the plot: a character x
# axis is drawn in alphabetical order. reorder() sorts the bars by descending
# count instead; labs() keeps the axis label as "Title".
data2 %>%
  filter(str_detect(Pros_2, "wlb")) %>%
  group_by(Title) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = reorder(Title, -count), y = count)) +
  geom_col() +
  labs(x = "Title") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_moma()

20 mentions from Software Engineer. Okay, what about that of Cons?

##### Count wlb in Cons by Title #####
# Keep the reviews whose Cons_2 text contains "wlb", then count them per
# Title, largest count first.
data2 %>%
  filter(str_detect(Cons_2, "wlb")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

##### Count wlb in Cons by Title #####
# Keep the reviews whose Cons_2 text contains "wlb", then count them per
# Title, largest count first.
data2 %>%
  filter(str_detect(Cons_2, "wlb")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

> ##### Count wlb in Conos by Title #####
> data2 %>%
+   filter(str_detect(Cons_2,'wlb') == TRUE) %>%
+   group_by(Title) %>%
+   summarise(count = n()) %>%
+   arrange(desc(count))
# A tibble: 178 × 2
                              Title count
                              <chr> <int>
1                Anonymous Employee    70
2     Software Development Engineer    49
3  Software Development Engineer II    32
4            Senior Product Manager    26
5   Software Development Engineer I    21
6                   Product Manager    18
7                    Senior Manager    14
8                   Program Manager    12
9      Software Development Manager    12
10                Software Engineer    11
# ... with 168 more rows

> ##### Count wlb in Conos by Title #####

> data2 %>%

+ filter(str_detect(Cons_2,'wlb') == TRUE) %>%

+ group_by(Title) %>%

+ summarise(count = n()) %>%

+ arrange(desc(count))

# A tibble: 178 × 2

Title count

1 Anonymous Employee 70

2 Software Development Engineer 49

3 Software Development Engineer II 32

4 Senior Product Manager 26

5 Software Development Engineer I 21

6 Product Manager 18

7 Senior Manager 14

8 Program Manager 12

9 Software Development Manager 12

10 Software Engineer 11

# ... with 168 more rows

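178 titles vs. 43 titles. Yeah, well, it seems wlb is not the same across teams. 49 Software Development Engineers complained about wlb, but only 8 praised it. At least 102 of the Cons mentions (49 + 32 + 21) come from the three Software Development Engineer titles. Note that 178 is the number of distinct titles rather than the total number of mentions, so 102 out of 178 (57%) is only a rough indicator, not a true share. I used ‘at least’ because some of the Anonymous Employees could be engineers too.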

Next what I want to see is Cons comments that have the word ‘green card.’

##### Count GC in Cons by Title #####
# Keep the reviews whose Cons_2 text contains "green card", then count them
# per Title, largest count first.
data2 %>%
  filter(str_detect(Cons_2, "green card")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

##### Count GC in Cons by Title #####
# Keep the reviews whose Cons_2 text contains "green card", then count them
# per Title, largest count first.
data2 %>%
  filter(str_detect(Cons_2, "green card")) %>%
  group_by(Title) %>%
  tally(sort = TRUE, name = "count")

> ##### Count GC in Cons by Title #####
> data2 %>%
+   filter(str_detect(Cons_2,'green card') == TRUE) %>%
+   group_by(Title) %>%
+   summarise(count = n()) %>%
+   arrange(desc(count))
# A tibble: 7 × 2
                             Title count
                             <chr> <int>
1  Software Development Engineer I     3
2               Anonymous Employee     2
3    Software Development Engineer     2
4                             SDE1     1
5                             SDE2     1
6           Senior Product Manager     1
7 Software Development Engineer II     1

> ##### Count GC in Cons by Title #####

> data2 %>%

+ filter(str_detect(Cons_2,'green card') == TRUE) %>%

+ group_by(Title) %>%

+ summarise(count = n()) %>%

+ arrange(desc(count))

# A tibble: 7 × 2

Title count

1 Software Development Engineer I 3

2 Anonymous Employee 2

3 Software Development Engineer 2

4 SDE1 1

5 SDE2 1

6 Senior Product Manager 1

7 Software Development Engineer II 1

Hm, it’s not that much. When were these reviews submitted?

##### GC and Posted_Dates #####
# Posting dates of the reviews whose Cons_2 text contains "green card".
data2 %>%
  filter(str_detect(Cons_2, "green card")) %>%
  select(Posted_Date)

##### GC and Posted_Dates #####
# Posting dates of the reviews whose Cons_2 text contains "green card".
data2 %>%
  filter(str_detect(Cons_2, "green card")) %>%
  select(Posted_Date)

> ##### GC and Posted_Dates #####
> data2 %>%
+   filter(str_detect(Cons_2,'green card') == TRUE) %>%
+   select(Posted_Date)
   Posted_Date
1   2016-11-08
2   2016-09-29
3   2016-06-13
4   2016-02-02
5   2015-08-02
6   2015-07-16
7   2014-10-07
8   2014-09-11
9   2014-09-05
10  2013-01-24
11  2012-08-06

> ##### GC and Posted_Dates #####

> data2 %>%

+ filter(str_detect(Cons_2,'green card') == TRUE) %>%

+ select(Posted_Date)

Posted_Date

1 2016-11-08

2 2016-09-29

3 2016-06-13

4 2016-02-02

5 2015-08-02

6 2015-07-16

7 2014-10-07

8 2014-09-11

9 2014-09-05

10 2013-01-24

11 2012-08-06

Hm, since 2012? Well, things have probably changed since then, so I won’t drill down any further.